In [1]:
import numpy as np
import pandas as pd
import torch
import sys

# from sklearn.manifold import TSNE
# import plotly.graph_objects as go

# Use the first CUDA GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Common prefix of the per-period weight folders; a two-digit year suffix
# (e.g. "63") is appended to it before the folder is passed to find_similar.
WEIGHTS_DIR = "weights/cbow_histaware_"

## Getting Embeddings

In [3]:
def get_embedding(model):
    """Return the model's first parameter tensor as a row-normalised NumPy array.

    The first tensor yielded by ``model.parameters()`` is used as the
    embedding matrix. Each row is divided by its Euclidean (L2) norm, the
    shape of the result is printed, and the normalised array is returned.
    """
    # First parameter tensor of the model, moved to host memory as a NumPy array.
    first_param = list(model.parameters())[0]
    weights = first_param.detach().cpu().numpy()

    # Per-row L2 norm, kept as a column so it broadcasts across each row.
    row_norms = (weights ** 2).sum(axis=1, keepdims=True) ** 0.5
    unit_rows = weights / row_norms
    print(unit_rows.shape)
    return unit_rows

## Find Similar Words

In [4]:
def get_top_similar(word: str, embeddings_norm, vocab, topN: int = 10):
    """Return the ``topN`` tokens whose embedding rows are most similar to ``word``.

    Args:
        word: Query token; looked up with ``vocab[word]``.
        embeddings_norm: 2-D array with one embedding row per vocabulary id.
            Presumably rows are unit-length (as produced by ``get_embedding``),
            which makes the dot products below cosine similarities.
        vocab: Object supporting ``vocab[token] -> id`` and
            ``vocab.lookup_token(id) -> token``.
        topN: Maximum number of neighbours to return; values below 1 yield
            an empty result.

    Returns:
        A dict mapping token -> similarity score, in descending order of
        similarity, or ``None`` (after printing a message) when ``word``
        maps to id 0.
    """
    word_id = vocab[word]
    # Id 0 is treated as the out-of-vocabulary id -- TODO confirm against how
    # the vocab was built.
    if word_id == 0:
        print("Out of vocabulary word")
        return

    # Dot product of every row with the query row.
    dists = np.matmul(embeddings_norm, embeddings_norm[word_id])
    ranked_ids = np.argsort(-dists)
    # Remove the query by id rather than assuming it is ranked first: a
    # duplicate or tied vector can take rank 0, in which case slicing off the
    # first entry would drop a real neighbour and return the query itself.
    topN_ids = ranked_ids[ranked_ids != word_id][: max(topN, 0)]

    topN_dict = {}
    for sim_word_id in topN_ids:
        # Hand lookup_token a plain Python int instead of a NumPy integer.
        sim_word = vocab.lookup_token(int(sim_word_id))
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [5]:
def find_similar(word, year_folder, topN=10):
    """Load one period's model and vocab, then print and return similar tokens.

    Args:
        word: Query token.
        year_folder: Folder, relative to the parent directory, that contains
            ``model.pt`` and ``vocab.pt``.
        topN: Maximum number of similar tokens to return.

    Returns:
        List of the most similar tokens, most similar first. The list is
        empty when ``get_top_similar`` reports the word as out of vocabulary.
    """
    # NOTE(review): torch.load unpickles arbitrary Python objects; only point
    # this at checkpoints from a trusted source.
    model = torch.load(f"../{year_folder}/model.pt", map_location=device)
    vocab = torch.load(f"../{year_folder}/vocab.pt")
    embeddings_norm = get_embedding(model)
    similar = get_top_similar(word, embeddings_norm, vocab, topN)
    # get_top_similar returns None for out-of-vocabulary words; calling
    # .items() on that result raised AttributeError, so fall back to [].
    topnwords = list(similar) if similar is not None else []
    print(topnwords)
    return topnwords
   

## Similar words - aardgas

### Similar words - gas 1963

In [36]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "63".
year_folder = f"{WEIGHTS_DIR}63"
sim_word_gas_63 = find_similar(word='aardgas', year_folder=year_folder)

(28826, 300)
['gas', 'aardgasvondst', 'gasveld', 'Noorden', 'bodemschatten', 'aardgasvoorraden', 'gasbel', 'bitumina', 'aardgasreserve', 'uranium']


### Similar words - gas 1967

In [37]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "67".
year_folder = f"{WEIGHTS_DIR}67"
sim_word_gas_67 = find_similar(word='aardgas', year_folder=year_folder)

(28586, 300)
['gas', 'gasveld', 'Slochterengas', 'kernenergie', 'gaa', 'delfstof', 'uranium', 'aardgasnet', 'winningsvergunning', 'blokverwarming']


### Similar words - gas 1971

In [38]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "71".
year_folder = f"{WEIGHTS_DIR}71"
sim_word_gas_71 = find_similar(word='aardgas', year_folder=year_folder)

(28473, 300)
['gas', 'gasveld', 'zwavelarme', 'magnesiumzout', 'monopolie', 'aardgasnet', 'bruinkool', 'aardgasveld', '31,4', 'benzine']


### Similar words - gas 1975

In [39]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "75".
year_folder = f"{WEIGHTS_DIR}75"
sim_word_gas_75 = find_similar(word='aardgas', year_folder=year_folder)

(26716, 300)
['gas', 'Noordzeegas', 'gasveld', 'olieproducten', 'aardgasnet', 'steenkool', 'aardgasveld', 'gasbel', 'gasnet', 'olieprodukten']


### Similar words - gas 1979

In [40]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "79".
year_folder = f"{WEIGHTS_DIR}79"
sim_word_gas_79 = find_similar(word='aardgas', year_folder=year_folder)

(21924, 300)
['gas', 'propaangas', 'gasveld', 'Noordzeegas', 'aardolieprodukten', 'LPG', 'olieprodukten', 'gasvormige', 'energie', 'kolengas']


### Similar words - gas 1983

In [41]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "83".
year_folder = f"{WEIGHTS_DIR}83"
sim_word_gas_83 = find_similar(word='aardgas', year_folder=year_folder)

(37265, 300)
['gas', 'exportgas', 'gasbel', 'aardgaswinning', 'grondstoffen', 'servicekosten', 'energie', 'gasvelden', 'gasveld', 'gasverkoop']


### Similar words - gas 1987

In [42]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "87".
year_folder = f"{WEIGHTS_DIR}87"
sim_word_gas_87 = find_similar(word='aardgas', year_folder=year_folder)

(44278, 300)
['gas', 'stortgas', 'gasverkoop', 'scheepvaartverkeer', 'aardgascondensaat', 'propaangas', 'biogas', 'olieproducten', 'steenzout', 'benzine']


### Similar words - gas 1991

In [43]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "91".
year_folder = f"{WEIGHTS_DIR}91"
sim_word_gas_91 = find_similar(word='aardgas', year_folder=year_folder)

(40437, 300)
['gas', 'bruinkool', '73,6', 'zonne-', 'waterstof', 'tolueen', 'propaan', '2.400', 'gasreserve', 'benzine']


### Similar words - gas 1995

In [44]:
# Nearest neighbours of 'aardgas' in the weights folder with suffix "95".
year_folder = f"{WEIGHTS_DIR}95"
sim_word_gas_95 = find_similar(word='aardgas', year_folder=year_folder)

(26882, 300)
['gas', 'lpg', 'brandstoffen', 'reststoffen', 'energie', 'waddengas', 'aardolieprodukten', 'ingevoerde', 'verplaatst', 'gasvoorraden']


## Similar words - aardolie

### Similar words - aardolie 1963

In [15]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "63".
year_folder = f"{WEIGHTS_DIR}63"
sim_word_aardolie_63 = find_similar(word='aardolie', year_folder=year_folder)

(28826, 300)
['olie', 'bruinkool', 'delfstoffen', 'olleprodukten', 'verscheping', 'oliebronnen', 'waterstof', 'bauxiet', 'afzet', 'grondstof']


### Similar words - aardolie 1967

In [16]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "67".
year_folder = f"{WEIGHTS_DIR}67"
sim_word_aardolie_67 = find_similar(word='aardolie', year_folder=year_folder)

(28586, 300)
['olie', 'distributie', 'brutoproduktie', 'olie-', 'goedkoop', 'koolwaterstoffen', 'bruinkool', 'voedingsmiddelen', 'zwavel', 'Noordzeebodem']


### Similar words - aardolie 1971

In [17]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "71".
year_folder = f"{WEIGHTS_DIR}71"
sim_word_aardolie_71 = find_similar(word='aardolie', year_folder=year_folder)

(28473, 300)
['olie', 'grondstoffen', 'stookolie', 'granen', 'Brunei', 'ertsen', 'staal', 'olie-aanvoer', 'olie-', 'vervaardiging']


### Similar words - aardolie 1975

In [18]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "75".
year_folder = f"{WEIGHTS_DIR}75"
sim_word_aardolie_75 = find_similar(word='aardolie', year_folder=year_folder)

(26716, 300)
['olie', 'synthetisch', 'bruinkool', 'gasolle', 'West-Europa', 'lood', 'stookolie', 'West-Duitsland', 'oliereserves', 'eindprodukten']


### Similar words - aardolie 1979

In [19]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "79".
year_folder = f"{WEIGHTS_DIR}79"
sim_word_aardolie_79 = find_similar(word='aardolie', year_folder=year_folder)

(21924, 300)
['olie', 'aardolie-', 'vliegtuigen', 'ertsen', 'methaan', 'steenkoolgas', 'grondstoffen', 'stookolie', 'olie-', 'energiedragers']


### Similar words - aardolie 1983

In [20]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "83".
year_folder = f"{WEIGHTS_DIR}83"
sim_word_aardolie_83 = find_similar(word='aardolie', year_folder=year_folder)

(37265, 300)
['olie', 'stookolie', 'kapitaalgoederen', 'vloeistoffen', 'grondstof', 'produkten', 'kolengas', 'bruinkool', 'gasvelden', 'ex-dlvidend']


### Similar words - aardolie 1987

In [21]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "87".
year_folder = f"{WEIGHTS_DIR}87"
sim_word_aardolie_87 = find_similar(word='aardolie', year_folder=year_folder)

(44278, 300)
['olie', 'olie-', 'edelmetalen', 'stookolie', 'veevoeders', 'residu', 'olieprodukten', 'energieprodukten', 'ijzererts', 'ertsen']


### Similar words - aardolie 1991

In [22]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "91".
year_folder = f"{WEIGHTS_DIR}91"
sim_word_aardolie_91 = find_similar(word='aardolie', year_folder=year_folder)

(40437, 300)
['olie', 'dieselolie', 'ohe', 'geachte', 'plastics', 'stookolie', 'ethyleen', 'nettoproduktie', 'componenten', 'afvoer']


### Similar words - aardolie 1995

In [23]:
# Nearest neighbours of 'aardolie' in the weights folder with suffix "95".
year_folder = f"{WEIGHTS_DIR}95"
sim_word_aardolie_95 = find_similar(word='aardolie', year_folder=year_folder)

(26882, 300)
['olie', 'ertsen', 'Opsporing', 'massagoederen', 'ƒ37,10', 'energie', 'ohe', 'aluminium', 'bruinkool', 'dieselolie']


## Similar words - steenkool

### Similar words - steenkool 1963

In [24]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "63".
year_folder = f"{WEIGHTS_DIR}63"
sim_word_steenkool_63 = find_similar(word='steenkool', year_folder=year_folder)

(28826, 300)
['kolen', 'steenkolen', 'anthraciet', 'huisbrandkolen', 'ruwe', 'draagvermogen', 'Sovjet-Unie', 'kernenergie', 'ijzererts', 'stikstof']


### Similar words - steenkool 1967

In [25]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "67".
year_folder = f"{WEIGHTS_DIR}67"
sim_word_steenkool_67 = find_similar(word='steenkool', year_folder=year_folder)

(28586, 300)
['kolen', '32.000', 'cokes', 'industriekolen', 'huisbrandkolen', 'Emma', 'steenkolen', 'huisbrand', 'mijnindustrie', 'Groot-Brittannië']


### Similar words - steenkool 1971

In [26]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "71".
year_folder = f"{WEIGHTS_DIR}71"
sim_word_steenkool_71 = find_similar(word='steenkool', year_folder=year_folder)

(28473, 300)
['kolen', 'cokes', 'steenkolen', 'zwavel', 'aardgas', 'import', 'koolstof', 'benzine', 'huisbrand', 'zwavelarme']


### Similar words - steenkool 1975

In [27]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "75".
year_folder = f"{WEIGHTS_DIR}75"
sim_word_steenkool_75 = find_similar(word='steenkool', year_folder=year_folder)

(26716, 300)
['kolen', 'bruinkool', 'mineralen', 'steenkolen', 'leisteen', 'teerzand', 'elektriciteit', 'delfstoffen', 'aardgas', 'cokes']


### Similar words - steenkool 1979

In [28]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "79".
year_folder = f"{WEIGHTS_DIR}79"
sim_word_steenkool_79 = find_similar(word='steenkool', year_folder=year_folder)

(21924, 300)
['kolen', 'steenkolen', 'teerzand', 'steenkoollagen', 'aardwarmte', 'bruinkool', 'mijnen', 'Duinkerken', 'zonneenergie', 'aardolie']


### Similar words - steenkool 1983

In [29]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "83".
year_folder = f"{WEIGHTS_DIR}83"
sim_word_steenkool_83 = find_similar(word='steenkool', year_folder=year_folder)

(37265, 300)
['kolen', 'steenkolen', 'teerzand', 'bruinkool', 'methanol', 'steen-', 'fosfaten', 'olie-equivalent', 'ethanol', 'cadmium']


### Similar words - steenkool 1987

In [30]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "87".
year_folder = f"{WEIGHTS_DIR}87"
sim_word_steenkool_87 = find_similar(word='steenkool', year_folder=year_folder)

(44278, 300)
['kolen', 'steenkolen', 'bruinkool', 'mica', 'delfstoffen', 'mineralen', 'landbouwprodukten', 'platina', 'Envoy', 'industrieprodukten']


### Similar words - steenkool 1991

In [31]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "91".
year_folder = f"{WEIGHTS_DIR}91"
sim_word_steenkool_91 = find_similar(word='steenkool', year_folder=year_folder)

(40437, 300)
['kolen', 'steenkolen', 'platina', 'kassen', 'cadmium', 'uitvoer', 'goudmijnen', 'huishoudingen', 'kalk', 'suiker']


### Similar words - steenkool 1995

In [32]:
# Nearest neighbours of 'steenkool' in the weights folder with suffix "95".
year_folder = f"{WEIGHTS_DIR}95"
sim_word_steenkool_95 = find_similar(word='steenkool', year_folder=year_folder)

(26882, 300)
['kolen', 'landbouwprodukten', 'graan', 'kunstmest', 'upstream', 'Unox', 'oplosmiddelen', '260.000', 'oliehoudende', 'excl']


## Similar words - aardgas (summary table)

In [49]:
# Gather the per-period 'aardgas' neighbour lists into one table (one row per
# period) and write it to gas.csv.
pd.set_option('display.max_colwidth', 255)
d = {
    'years': [
        '1960-63', '1964-67', '1968-71', '1972-75', '1976-79',
        '1980-83', '1984-87', '1988-91', '1992-95',
    ],
    'Similar words - aardgas': [
        sim_word_gas_63,
        sim_word_gas_67,
        sim_word_gas_71,
        sim_word_gas_75,
        sim_word_gas_79,
        sim_word_gas_83,
        sim_word_gas_87,
        sim_word_gas_91,
        sim_word_gas_95,
    ],
}
df_gas = pd.DataFrame(d)
df_gas.to_csv('gas.csv')

## Similar words - aardolie (summary table)

In [50]:
# Gather the per-period 'aardolie' neighbour lists into one table (one row per
# period) and write it to aardolie.csv.
pd.set_option('display.max_colwidth', 255)
d = {
    'years': [
        '1960-63', '1964-67', '1968-71', '1972-75', '1976-79',
        '1980-83', '1984-87', '1988-91', '1992-95',
    ],
    'Similar words - aardolie': [
        sim_word_aardolie_63,
        sim_word_aardolie_67,
        sim_word_aardolie_71,
        sim_word_aardolie_75,
        sim_word_aardolie_79,
        sim_word_aardolie_83,
        sim_word_aardolie_87,
        # The last two rows must use the aardolie results; the 'aardgas'
        # lists (sim_word_gas_91 / sim_word_gas_95) were used here by mistake.
        sim_word_aardolie_91,
        sim_word_aardolie_95,
    ],
}
df_oil = pd.DataFrame(data=d)
df_oil.to_csv('aardolie.csv')

## Similar words - steenkool (summary table)

In [51]:
# Gather the per-period 'steenkool' neighbour lists into one table (one row
# per period) and write it to steenkool.csv.
pd.set_option('display.max_colwidth', 255)
d = {
    'years': [
        '1960-63', '1964-67', '1968-71', '1972-75', '1976-79',
        '1980-83', '1984-87', '1988-91', '1992-95',
    ],
    'Similar words - steenkool': [
        sim_word_steenkool_63,
        sim_word_steenkool_67,
        sim_word_steenkool_71,
        sim_word_steenkool_75,
        sim_word_steenkool_79,
        sim_word_steenkool_83,
        sim_word_steenkool_87,
        # The last two rows must use the steenkool results; the 'aardgas'
        # lists (sim_word_gas_91 / sim_word_gas_95) were used here by mistake.
        sim_word_steenkool_91,
        sim_word_steenkool_95,
    ],
}
df_coal = pd.DataFrame(data=d)
df_coal.to_csv('steenkool.csv')