In [25]:
import numpy as np
import pandas as pd
import torch
import sys

# from sklearn.manifold import TSNE
# import plotly.graph_objects as go

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [26]:
WEIGHTS_DIR ="weights/cbow_histaware_"

## Getting Embeddings

In [28]:
def get_embedding(model):
    # embedding from first model layer
    embeddings = list(model.parameters())[0]
    embeddings = embeddings.cpu().detach().numpy()

    # normalization
    norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
    norms = np.reshape(norms, (len(norms), 1))
    embeddings_norm = embeddings / norms
    print(embeddings_norm.shape)
    return embeddings_norm

# Find Similar Words

In [29]:
def get_top_similar(word: str, embeddings_norm, vocab, topN: int = 10):
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [30]:
def find_similar(word, year_folder,topN=10):
    model = torch.load(f"../{year_folder}/model.pt", map_location=device)
    vocab = torch.load(f"../{year_folder}/vocab.pt")
    embeddings_norm = get_embedding(model)
    topnwords = [word for word, sim in get_top_similar(word,embeddings_norm, vocab, topN).items()]
    print(topnwords)
    return topnwords
   

## Similar words - gas

### Similar words - gas 1963

In [54]:
year_folder = WEIGHTS_DIR+"63"
sim_word_gas_63 = find_similar('gas', year_folder)

(28826, 300)
['aardgas', 'propaangas', 'opwekken', 'gasfornuis', 'gaa', 'gas-', 'zeewater', 'gassen', 'water', 'beton']


### Similar words - gas 1967

In [55]:
year_folder = WEIGHTS_DIR+"67"
sim_word_gas_67 = find_similar('gas', year_folder)

(28586, 300)
['aardgas', 'koolzuur', 'water', 'petroleum', 'gasveld', 'veld', 'gaa', 'methaan', 'vuur', 'gassen']


### Similar words - gas 1971

In [56]:
year_folder = WEIGHTS_DIR+"71"
sim_word_gas_71 = find_similar('gas', year_folder)

(28473, 300)
['aardgas', 'chloorgas', 'propaangas', 'benzine', 'gas-', 'stoom', 'gassen', 'koelwater', 'kalium-', 'magnesiumzout']


### Similar words - gas 1975

In [57]:
year_folder = WEIGHTS_DIR+"75"
sim_word_gas_75 = find_similar('gas', year_folder)

(26716, 300)
['aardgas', 'propaangas', 'poeder', 'Noordzeegas', 'methaangas', 'olieproducten', 'teerzand', 'koolzuur', 'leisteen', 'vinylchloride']


### Similar words - gas 1979

In [58]:
year_folder = WEIGHTS_DIR+"79"
sim_word_gas_79 = find_similar('gas', year_folder)

(21924, 300)
['aardgas', 'propaangas', 'gasveld', 'propaan', 'formaldehydegas', 'gassen', 'stoom', 'spul', 'gat', 'warmte']


### Similar words - gas 1983

In [59]:
year_folder = WEIGHTS_DIR+"83"
sim_word_gas_83 = find_similar('gas', year_folder)

(37265, 300)
['aardgas', 'Gas', 'gas-', 'petroleumgas', 'methaangas', 'wanbetaling', 'gasveld', 'afvalgassen', 'aardgas-', 'dampen']


### Similar words - gas 1987

In [60]:
year_folder = WEIGHTS_DIR+"87"
sim_word_gas_87 = find_similar('gas', year_folder)

(44278, 300)
['aardgas', 'boorgruis', 'aas', 'drinkwater', 'gassen', 'helium', 'gifgas', 'biogas', 'gas-', 'methaangas']


### Similar words - gas 1991

In [61]:
year_folder = WEIGHTS_DIR+"91"
sim_word_gas_91 = find_similar('gas', year_folder)

FileNotFoundError: [Errno 2] No such file or directory: '../weights/cbow_histaware_91/model.pt'

### Similar words - gas 1995

In [62]:
year_folder = WEIGHTS_DIR+"95"
sim_word_gas_95 = find_similar('gas', year_folder)

FileNotFoundError: [Errno 2] No such file or directory: '../weights/cbow_histaware_95/model.pt'

## Similar words - aardolie

### Similar words - aardolie 1963

In [63]:
year_folder = WEIGHTS_DIR+"63"
sim_word_aardolie_63 = find_similar('aardolie', year_folder)

(28826, 300)
['olie', 'bruinkool', 'delfstoffen', 'olleprodukten', 'verscheping', 'oliebronnen', 'waterstof', 'bauxiet', 'afzet', 'grondstof']


### Similar words - aardolie 1967

In [64]:
year_folder = WEIGHTS_DIR+"67"
sim_word_aardolie_67 = find_similar('aardolie', year_folder)

(28586, 300)
['olie', 'distributie', 'brutoproduktie', 'olie-', 'goedkoop', 'koolwaterstoffen', 'bruinkool', 'voedingsmiddelen', 'zwavel', 'Noordzeebodem']


### Similar words - aardolie 1971

In [65]:
year_folder = WEIGHTS_DIR+"71"
sim_word_aardolie_71 = find_similar('aardolie', year_folder)

(28473, 300)
['olie', 'grondstoffen', 'stookolie', 'granen', 'Brunei', 'ertsen', 'staal', 'olie-aanvoer', 'olie-', 'vervaardiging']


### Similar words - aardolie 1975

In [66]:
year_folder = WEIGHTS_DIR+"75"
sim_word_aardolie_75 = find_similar('aardolie', year_folder)

(26716, 300)
['olie', 'synthetisch', 'bruinkool', 'gasolle', 'West-Europa', 'lood', 'stookolie', 'West-Duitsland', 'oliereserves', 'eindprodukten']


### Similar words - aardolie 1979

In [67]:
year_folder = WEIGHTS_DIR+"79"
sim_word_aardolie_79 = find_similar('aardolie', year_folder)

(21924, 300)
['olie', 'aardolie-', 'vliegtuigen', 'ertsen', 'methaan', 'steenkoolgas', 'grondstoffen', 'stookolie', 'olie-', 'energiedragers']


### Similar words - aardolie 1983

In [68]:
year_folder = WEIGHTS_DIR+"83"
sim_word_aardolie_83 = find_similar('aardolie', year_folder)

(37265, 300)
['olie', 'stookolie', 'kapitaalgoederen', 'vloeistoffen', 'grondstof', 'produkten', 'kolengas', 'bruinkool', 'gasvelden', 'ex-dlvidend']


### Similar words - aardolie 1987

In [69]:
year_folder = WEIGHTS_DIR+"87"
sim_word_aardolie_87 = find_similar('aardolie', year_folder)

(44278, 300)
['olie', 'olie-', 'edelmetalen', 'stookolie', 'veevoeders', 'residu', 'olieprodukten', 'energieprodukten', 'ijzererts', 'ertsen']


### Similar words - aardolie 1991

In [70]:
year_folder = WEIGHTS_DIR+"91"
sim_word_aardolie_91 = find_similar('aardolie', year_folder)

FileNotFoundError: [Errno 2] No such file or directory: '../weights/cbow_histaware_91/model.pt'

### Similar words - aardolie 1995

In [None]:
year_folder = WEIGHTS_DIR+"95"
sim_word_aardolie_95 = find_similar('aardolie', year_folder)

## Similar words - steenkool

### Similar words - steenkool 1963

In [71]:
year_folder = WEIGHTS_DIR+"63"
sim_word_steenkool_63 = find_similar('steenkool', year_folder)

(28826, 300)
['kolen', 'steenkolen', 'anthraciet', 'huisbrandkolen', 'ruwe', 'draagvermogen', 'Sovjet-Unie', 'kernenergie', 'ijzererts', 'stikstof']


### Similar words - steenkool 1967

In [72]:
year_folder = WEIGHTS_DIR+"67"
sim_word_steenkool_67 = find_similar('steenkool', year_folder)

(28586, 300)
['kolen', '32.000', 'cokes', 'industriekolen', 'huisbrandkolen', 'Emma', 'steenkolen', 'huisbrand', 'mijnindustrie', 'Groot-Brittannië']


### Similar words - steenkool 1971

In [73]:
year_folder = WEIGHTS_DIR+"71"
sim_word_steenkool_71 = find_similar('steenkool', year_folder)

(28473, 300)
['kolen', 'cokes', 'steenkolen', 'zwavel', 'aardgas', 'import', 'koolstof', 'benzine', 'huisbrand', 'zwavelarme']


### Similar words - steenkool 1975

In [74]:
year_folder = WEIGHTS_DIR+"75"
sim_word_steenkool_75 = find_similar('steenkool', year_folder)

(26716, 300)
['kolen', 'bruinkool', 'mineralen', 'steenkolen', 'leisteen', 'teerzand', 'elektriciteit', 'delfstoffen', 'aardgas', 'cokes']


### Similar words - steenkool 1979

In [75]:
year_folder = WEIGHTS_DIR+"79"
sim_word_steenkool_79 = find_similar('steenkool', year_folder)

(21924, 300)
['kolen', 'steenkolen', 'teerzand', 'steenkoollagen', 'aardwarmte', 'bruinkool', 'mijnen', 'Duinkerken', 'zonneenergie', 'aardolie']


### Similar words - steenkool 1983

In [76]:
year_folder = WEIGHTS_DIR+"83"
sim_word_steenkool_83 = find_similar('steenkool', year_folder)

(37265, 300)
['kolen', 'steenkolen', 'teerzand', 'bruinkool', 'methanol', 'steen-', 'fosfaten', 'olie-equivalent', 'ethanol', 'cadmium']


### Similar words - steenkool 1987

In [77]:
year_folder = WEIGHTS_DIR+"87"
sim_word_steenkool_87 = find_similar('steenkool', year_folder)

(44278, 300)
['kolen', 'steenkolen', 'bruinkool', 'mica', 'delfstoffen', 'mineralen', 'landbouwprodukten', 'platina', 'Envoy', 'industrieprodukten']


### Similar words - steenkool 1991

In [78]:
year_folder = WEIGHTS_DIR+"91"
sim_word_steenkool_91 = find_similar('steenkool', year_folder)

FileNotFoundError: [Errno 2] No such file or directory: '../weights/cbow_histaware_91/model.pt'

### Similar words - steenkool 1995

In [None]:
year_folder = WEIGHTS_DIR+"95"
sim_word_steenkool_95 = find_similar('steenkool', year_folder)

## Similar words - Gas

In [81]:
pd.set_option('display.max_colwidth', 255)
d = {'years':['1960-63','1964-67','1968-71','1972-75','1976-79','1980-83','1984-87','1988-91','1992-95'],
     'Similar words - Gas':[sim_word_gas_63,sim_word_gas_67,sim_word_gas_71,sim_word_gas_75,sim_word_gas_79,sim_word_gas_83,sim_word_gas_87, None, None]}
                      #,sim_word_gas_91,sim_word_gas_95]}
pd.DataFrame(data=d)

Unnamed: 0,years,Similar words - Gas
0,1960-63,"[aardgas, propaangas, opwekken, gasfornuis, gaa, gas-, zeewater, gassen, water, beton]"
1,1964-67,"[aardgas, koolzuur, water, petroleum, gasveld, veld, gaa, methaan, vuur, gassen]"
2,1968-71,"[aardgas, chloorgas, propaangas, benzine, gas-, stoom, gassen, koelwater, kalium-, magnesiumzout]"
3,1972-75,"[aardgas, propaangas, poeder, Noordzeegas, methaangas, olieproducten, teerzand, koolzuur, leisteen, vinylchloride]"
4,1976-79,"[aardgas, propaangas, gasveld, propaan, formaldehydegas, gassen, stoom, spul, gat, warmte]"
5,1980-83,"[aardgas, Gas, gas-, petroleumgas, methaangas, wanbetaling, gasveld, afvalgassen, aardgas-, dampen]"
6,1984-87,"[aardgas, boorgruis, aas, drinkwater, gassen, helium, gifgas, biogas, gas-, methaangas]"
7,1988-91,
8,1992-95,


## Similar words - aardolie

In [82]:
pd.set_option('display.max_colwidth', 255)
d = {'years':['1960-63','1964-67','1968-71','1972-75','1976-79','1980-83','1984-87','1988-91','1992-95'],
     'Similar words - aardolie':[sim_word_aardolie_63,sim_word_aardolie_67,sim_word_aardolie_71,sim_word_aardolie_75,sim_word_aardolie_79,sim_word_aardolie_83,sim_word_aardolie_87, None, None]}
                      #,sim_word_gas_91,sim_word_gas_95]}
pd.DataFrame(data=d)

Unnamed: 0,years,Similar words - aardolie
0,1960-63,"[olie, bruinkool, delfstoffen, olleprodukten, verscheping, oliebronnen, waterstof, bauxiet, afzet, grondstof]"
1,1964-67,"[olie, distributie, brutoproduktie, olie-, goedkoop, koolwaterstoffen, bruinkool, voedingsmiddelen, zwavel, Noordzeebodem]"
2,1968-71,"[olie, grondstoffen, stookolie, granen, Brunei, ertsen, staal, olie-aanvoer, olie-, vervaardiging]"
3,1972-75,"[olie, synthetisch, bruinkool, gasolle, West-Europa, lood, stookolie, West-Duitsland, oliereserves, eindprodukten]"
4,1976-79,"[olie, aardolie-, vliegtuigen, ertsen, methaan, steenkoolgas, grondstoffen, stookolie, olie-, energiedragers]"
5,1980-83,"[olie, stookolie, kapitaalgoederen, vloeistoffen, grondstof, produkten, kolengas, bruinkool, gasvelden, ex-dlvidend]"
6,1984-87,"[olie, olie-, edelmetalen, stookolie, veevoeders, residu, olieprodukten, energieprodukten, ijzererts, ertsen]"
7,1988-91,
8,1992-95,


## Similar words - steenkool

In [83]:
pd.set_option('display.max_colwidth', 255)
d = {'years':['1960-63','1964-67','1968-71','1972-75','1976-79','1980-83','1984-87','1988-91','1992-95'],
     'Similar words - steenkool':[sim_word_steenkool_63,sim_word_steenkool_67,sim_word_steenkool_71,sim_word_steenkool_75,sim_word_steenkool_79,sim_word_steenkool_83,sim_word_steenkool_87, None, None]}
                      #,sim_word_gas_91,sim_word_gas_95]}
pd.DataFrame(data=d)

Unnamed: 0,years,Similar words - steenkool
0,1960-63,"[kolen, steenkolen, anthraciet, huisbrandkolen, ruwe, draagvermogen, Sovjet-Unie, kernenergie, ijzererts, stikstof]"
1,1964-67,"[kolen, 32.000, cokes, industriekolen, huisbrandkolen, Emma, steenkolen, huisbrand, mijnindustrie, Groot-Brittannië]"
2,1968-71,"[kolen, cokes, steenkolen, zwavel, aardgas, import, koolstof, benzine, huisbrand, zwavelarme]"
3,1972-75,"[kolen, bruinkool, mineralen, steenkolen, leisteen, teerzand, elektriciteit, delfstoffen, aardgas, cokes]"
4,1976-79,"[kolen, steenkolen, teerzand, steenkoollagen, aardwarmte, bruinkool, mijnen, Duinkerken, zonneenergie, aardolie]"
5,1980-83,"[kolen, steenkolen, teerzand, bruinkool, methanol, steen-, fosfaten, olie-equivalent, ethanol, cadmium]"
6,1984-87,"[kolen, steenkolen, bruinkool, mica, delfstoffen, mineralen, landbouwprodukten, platina, Envoy, industrieprodukten]"
7,1988-91,
8,1992-95,
