In [1]:
import wikipedia
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# The natural-language question doubles as the Wikipedia search string.
question = "What is on the periodic table?"

# Candidate article titles for the question; 10 is the library's default result count.
matches = wikipedia.search(question, results=10)
matches

['Periodic table',
 'History of the periodic table',
 'Periodic Videos',
 'Extended periodic table',
 'Periodic table of topological invariants',
 'Dmitri Mendeleev',
 'Metalloid',
 'The Periodic Table (Basher book)',
 'Alkali metal',
 'Descriptive research']

In [3]:
# V1: loop through all of these candidate documents. An alternative hypothesis is to use only the one document that best matches the query.

In [4]:
# Show the candidate titles again (a fresh list with the same items as `matches`).
list(matches)

['Periodic table',
 'History of the periodic table',
 'Periodic Videos',
 'Extended periodic table',
 'Periodic table of topological invariants',
 'Dmitri Mendeleev',
 'Metalloid',
 'The Periodic Table (Basher book)',
 'Alkali metal',
 'Descriptive research']

In [5]:
import os

import nltk
import torch
from models import InferSent

nltk.download('punkt')

# Locations of the pretrained InferSent encoder and the GloVe word vectors.
# Either one can be overridden through an environment variable; when the
# variable is unset the original hard-coded location is used.
INFERSENT_MODEL_PATH = os.environ.get(
    'INFERSENT_MODEL_PATH',
    '/Users/petermyers/Desktop/Other/data/InferSent/encoder/infersent1.pkl')
GLOVE_W2V_PATH = os.environ.get(
    'GLOVE_W2V_PATH',
    '/Users/petermyers/Desktop/Other/data/GloVe/glove.840B.300d.txt')

# Initialize
infersent = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1})
# torch.load unpickles the file, so only point this at a checkpoint you trust.
infersent.load_state_dict(torch.load(INFERSENT_MODEL_PATH))
infersent.set_w2v_path(GLOVE_W2V_PATH)

# My sentences: the question sits at index 0, followed by the candidate titles.
sentences = [question] + matches
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)
print(embeddings)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/petermyers/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Found 29(/29) words with w2v vectors
Vocab size : 29
[[0.08703331 0.08786308 0.05431972 ... 0.03837078 0.03438099 0.00756402]
 [0.00080437 0.01218406 0.0565544  ... 0.         0.02164815 0.        ]
 [0.         0.08619805 0.05026022 ... 0.0454581  0.02453266 0.01138138]
 ...
 [0.02356069 0.06758805 0.06377555 ... 0.00434644 0.02094805 0.0433331 ]
 [0.10077003 0.         0.16811676 ... 0.         0.         0.03339149]
 [0.         0.05960754 0.         ... 0.02698475 0.04557415 0.00756522]]


In [6]:


# Rescale every embedding dimension to [0, 1] across the encoded sentences, then
# compute all pairwise Euclidean distances. Row/column 0 belongs to the question.
#
# No StandardScaler pass is needed first: min-max scaling is invariant to a
# per-feature shift and positive rescale, so standardizing beforehand changes
# nothing beyond floating-point rounding.
embeddings = np.array(embeddings)
embeddings = MinMaxScaler().fit_transform(embeddings)
distances = pdist(embeddings, metric='euclidean')
# NOTE: despite its name, this matrix holds distances (smaller = more similar).
sentence_similarity_matrix = squareform(distances)

In [7]:
# Distances from the question (row 0) to every encoded sentence; entry 0 is the
# question's distance to itself.
sentence_similarity_matrix[0, :]

array([ 0.        , 29.50440347, 27.3204302 , 34.25857614, 28.32720965,
       35.24618979, 37.05823118, 36.50455042, 35.14499066, 36.0809703 ,
       34.87906189])

In [8]:
# Index of the entry closest to the question. Column 0 (the question itself) is
# sliced off, so the result is relative to the entries that follow it.
best_match = sentence_similarity_matrix[0, 1:].argmin()

In [9]:
# Fetch the full text of the article whose title is closest to the question.
# The title was returned by wikipedia.search, so it is looked up verbatim:
# auto_suggest=False stops the library from replacing it with a "suggested"
# title, which can silently resolve to a different page.
content_on_page = wikipedia.page(matches[best_match], auto_suggest=False).content
content_on_page

'The periodic table is an arrangement of the chemical elements, which are organized on the basis of their atomic numbers, electron configurations and recurring chemical properties. Elements are presented in order of increasing atomic number. The standard form of the table consists of a grid with rows called periods and columns called groups.\nThe history of the periodic table reflects over two centuries of growth in the understanding of chemical properties, with major contributions made by Antoine-Laurent de Lavoisier, Johann Wolfgang Döbereiner, John Newlands, Julius Lothar Meyer, Dmitri Mendeleev, and Glenn T. Seaborg.\n\n\n== Early history ==\n\nA number of physical elements (such as platinum, mercury, tin and zinc) have been known from antiquity, as they are found in their native form and are relatively simple to mine with primitive tools. Around 330 BCE, the Greek philosopher Aristotle proposed that everything is made up of a mixture of one or more roots, an idea that had original

In [10]:
# Re-display the page text fetched in the previous cell.
content_on_page

'The periodic table is an arrangement of the chemical elements, which are organized on the basis of their atomic numbers, electron configurations and recurring chemical properties. Elements are presented in order of increasing atomic number. The standard form of the table consists of a grid with rows called periods and columns called groups.\nThe history of the periodic table reflects over two centuries of growth in the understanding of chemical properties, with major contributions made by Antoine-Laurent de Lavoisier, Johann Wolfgang Döbereiner, John Newlands, Julius Lothar Meyer, Dmitri Mendeleev, and Glenn T. Seaborg.\n\n\n== Early history ==\n\nA number of physical elements (such as platinum, mercury, tin and zinc) have been known from antiquity, as they are found in their native form and are relatively simple to mine with primitive tools. Around 330 BCE, the Greek philosopher Aristotle proposed that everything is made up of a mixture of one or more roots, an idea that had original

In [11]:
import re
from nltk.tokenize import word_tokenize

# WordNetLemmatizer reads the WordNet corpus; fetch it so the cell also runs in
# an environment where only 'punkt' has been downloaded.
nltk.download('wordnet', quiet=True)
wnl = nltk.WordNetLemmatizer()

# Clean Sentences
# Split into sentences while the text still has its original casing (the
# sentence tokenizer uses capitalisation as a boundary cue), then lowercase
# each sentence.
sents = [sent.lower() for sent in nltk.sent_tokenize(content_on_page)]
# `doc` is kept as the lowercased page text for any later cell that reads it.
doc = content_on_page.lower()

# Anything that is not an ASCII letter, underscore or whitespace is stripped
# from each token; compiled once instead of once per token.
non_letters = re.compile(r'[^A-Za-z_\s]')

processed_sents = []
for sent in sents:
    words = word_tokenize(sent)
    words = [non_letters.sub('', w) for w in words]
    # Tokens that were pure punctuation/digits are now empty and are dropped.
    # NOTE(review): lemmatize() is called without a POS tag, so every token is
    # treated as a noun -- e.g. "as" comes out as "a". Confirm this is acceptable.
    words = [wnl.lemmatize(w) for w in words if w.strip() != '']
    processed_sent = " ".join(words)
    # Appended even when empty so indices stay aligned with `sents`.
    processed_sents.append(processed_sent)

In [12]:
processed_sents

['the periodic table is an arrangement of the chemical element which are organized on the basis of their atomic number electron configuration and recurring chemical property',
 'element are presented in order of increasing atomic number',
 'the standard form of the table consists of a grid with row called period and column called group',
 'the history of the periodic table reflects over two century of growth in the understanding of chemical property with major contribution made by antoinelaurent de lavoisier johann wolfgang dbereiner john newlands julius lothar meyer dmitri mendeleev and glenn t seaborg',
 'early history a number of physical element such a platinum mercury tin and zinc have been known from antiquity a they are found in their native form and are relatively simple to mine with primitive tool',
 'around bce the greek philosopher aristotle proposed that everything is made up of a mixture of one or more root an idea that had originally been suggested by the sicilian philoso

In [13]:
# Encode the question (index 0) together with every cleaned sentence of the page.
sentences = [question, *processed_sents]
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)
print(embeddings)

Found 978(/1046) words with w2v vectors
Vocab size : 978
[[0.08703331 0.08786308 0.05431972 ... 0.03837078 0.03438099 0.00756401]
 [0.03369612 0.11327431 0.13488707 ... 0.15930222 0.         0.        ]
 [0.10306813 0.1157925  0.06483371 ... 0.07241702 0.         0.        ]
 ...
 [0.09141635 0.12155321 0.11983121 ... 0.14431895 0.02453266 0.1669683 ]
 [0.1004963  0.0947988  0.0748287  ... 0.07203805 0.06210614 0.        ]
 [0.04413427 0.07937205 0.11888101 ... 0.03655036 0.05185106 0.01387161]]


In [14]:


# Rescale every embedding dimension to [0, 1] across the encoded sentences, then
# compute all pairwise Euclidean distances. Row/column 0 belongs to the question.
#
# No StandardScaler pass is needed first: min-max scaling is invariant to a
# per-feature shift and positive rescale, so standardizing beforehand changes
# nothing beyond floating-point rounding.
embeddings = np.array(embeddings)
embeddings = MinMaxScaler().fit_transform(embeddings)
distances = pdist(embeddings, metric='euclidean')
# NOTE: despite its name, this matrix holds distances (smaller = more similar).
sentence_similarity_matrix = squareform(distances)

In [15]:
# Index, within `processed_sents`, of the sentence closest to the question
# (column 0, the question itself, is excluded before taking the minimum).
best_match = sentence_similarity_matrix[0, 1:].argmin()
best_match

115

In [16]:
# The cleaned sentence at the best-matching index.
processed_sents[int(best_match)]

'this determines the order in which electron shell are filled and explains the periodicity of the periodic table'

In [17]:
# Indices into `processed_sents`, ordered from closest to farthest from the question.
# NOTE(review): this rebinds `best_match` from a single index to an array of
# indices, so a cell that does `processed_sents[best_match]` fails if re-run
# after this one. Consider a separate name.
best_match = sentence_similarity_matrix[0, 1:].argsort()
best_match

array([115,   2,  89,   8,  88,  62,   1,  28,  33,  70, 140, 113,  57,
       141,  20, 132,  43,  68,   0,  13,  27,  47, 133,  78,  87,  55,
        60,  38,  50, 103,  72,  67,  58,  61, 136,  85,  59,  14,  49,
        39, 117,  77,  31,  91, 121,  75,  41, 114, 106,  82, 123,  36,
        16,  69,  74, 135, 110,  19,  99,  63,  24,  37,  46,  81,  22,
        17, 128,  94,  96, 134,  83,  93,  12,  45, 107, 120, 105,   7,
         6, 116,  34, 108, 138,  76,  44,  95,  79, 131, 125,  10,  29,
       127,  30,  52,  71,  42,  48, 111,  56, 130,  86,  35, 118,   9,
        65,  97,  25,  51,  53,  80,  21, 139,   4, 119,  32,  11, 104,
        92,  23,  40, 126,  64,  90,  66,  98,  15, 124,   5, 102, 101,
       112,  73, 129,  18,  54,  84, 122, 137,  26,   3, 100, 109])

In [22]:
# Indices of the five sentences closest to the question.
np.argsort(sentence_similarity_matrix[0, 1:])[:5]

array([115,   2,  89,   8,  88])