In [1]:
from nltk.corpus import stopwords
from time import time
import os
import numpy as np
from collections import OrderedDict
import math

In [2]:
def timeit(method):
    def timed(*args, **kw):
        ts = time()
        result = method(*args, **kw)
        te = time()

        print( '%r (%r, %r) %2.2f sec' % (method.__name__, args, kw, te-ts) )
        return result
    return timed

In [3]:
from gensim.models import KeyedVectors
if not os.path.exists('GoogleNews-vectors-negative300.bin.gz'):
    raise ValueError("SKIP: You need to download the google news model")
start = time()
print("Loading Word Vectors...")
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=500000)
print('\nWord2vec model took %.2f seconds to load' % (time() - start))
stop_words = stopwords.words('english')

Loading Word Vectors...

Word2vec model took 29.75 seconds to load
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',

In [4]:
len(stop_words)

179

In [11]:
with open('capitals_processed_final.csv') as input_file:
    capital_of = OrderedDict()
    for line in input_file:
        if line.strip():
            print(line)
            try:
                country, capital = line.split(",")
                country = country.strip()
                capital = capital.strip()
                if(country != capital):
                    print(country, capital, "added")
                    capital_of[country] = capital
                    # start = time()
                    # print('Query time %.2f' % (time() - start))
                else:
                    print("Country and capital do no have separate vector")
                
            except ValueError:
                print('Skipping {}'.format(line))
            # print("*" * 100)
    # vectors_tuple = tuple(vectors)
    # final_matrix = np.concatenate(vectors_tuple)
    # print(vectors[3].shape)
    # print(vectors[3])
    # print(type(final_matrix))
    # print(final_matrix.shape)
    # print(final_matrix)

# 300 * 300 plane of basis vectors

Afghanistan, Kabul

Afghanistan Kabul added
Albania,Tirana

Albania Tirana added
Algeria,Algiers

Algeria Algiers added
Andorra,Andorra la Vella

Andorra Andorra la Vella added
Angola, Luanda

Angola Luanda added
Antigua and Barbuda, Saint John's

Antigua and Barbuda Saint John's added
Argentina, Buenos Aires

Argentina Buenos Aires added
Armenia, Yerevan

Armenia Yerevan added
Australia, Canberra

Australia Canberra added
Austria, Vienna

Austria Vienna added
Azerbaijan, Baku

Azerbaijan Baku added
The Bahamas, Nassau

The Bahamas Nassau added
Bahrain, Manama

Bahrain Manama added
Bangladesh, Dhaka

Bangladesh Dhaka added
Barbados, Bridgetown

Barbados Bridgetown added
Belarus, Minsk

Belarus Minsk added
Belgium, Brussels

Belgium Brussels added
Belize, Belmopan

Belize Belmopan added
Benin, Porto,Novo

Skipping Benin, Porto,Novo

Bhutan, Thimphu

Bhutan Thimphu added
Bolivia, La Paz (administrative); Sucre (judicial)

Bolivia La Paz (administrative); Sucre (judicial) added
Bosnia and

In [12]:
def train_new(capital_of, test_size):
    vectors = []
    for country, capital in list(capital_of.items())[:-test_size]:
        try:
            difference_vector = model.wv[country] - model.wv[capital]
            vectors.append(difference_vector)
        except KeyError:
                print('Vector unavailable for training data')
    train_matrix = np.array(vectors)
    m, n = train_matrix.shape
    print("M",m)
    print("N",n)
    X = []
    P = []
    for i in range(300):
        # Split 300-D vectors into 299-D a and 1-D b
        a = np.array(tuple(train_matrix[:,j] for j in range(300) if j != i) ).transpose()# 299 columns
        b = train_matrix[:,i] # i th column
        # print(train_matrix.shape)
        # print(a.shape)
        # print(b.shape)
        # print(i)
        a_transpose = a.transpose()
        # print(a_transpose.shape)
        # print(a_transpose.dot(a).shape)
        # print(np.linalg.inv(a_transpose.dot(a)).shape)
        # print(np.linalg.inv(a_transpose.dot(a)).dot(a_transpose).shape)
        # Find 299x1 dimensional vector x representing our semantic hyperplane
        x = np.linalg.inv(a_transpose.dot(a)).dot(a_transpose).dot(b)
        # find projection p of vector b on this hyperplane 
        p = a.dot(x)
        X.append(x)
        P.append(p)
    return X, P
    
    

In [13]:
    
def project(vector, plane):
    plane_transpose = plane.transpose()
    x = np.linalg.inv(plane_transpose.dot(plane)).dot(plane_transpose).dot(vector)
    p = plane.dot(x)
    return p

In [14]:
def predict_relationship_new(country, capital, X, cutoff):
    try:
        difference_vector = model.wv[country] - model.wv[capital]
        print("*" * 100)
        print(country, ",", capital)
        differences = []
        for i in range(300):
            # Split 300-D vectors into 299-D a and 1-D b
            actual_output = np.array(tuple(difference_vector[j] for j in range(300) if j != i) ).dot(X[i]) # 299 columns
            expected_output = difference_vector[i] # i th column
            difference_in_output = expected_output-actual_output
            differences.append(difference_in_output)
            # print("Actual output:", actual_output)
            # print("Expected output:", expected_output)
            # print("Difference:", difference_in_output) 
        # print(differences)
        differences_vector_norm = np.linalg.norm(np.array(differences))
        print(differences_vector_norm)
        if(differences_vector_norm < cutoff):
            print("Capital_of relationship identified")
            return True
        else:
            print("Not Capital_of relationship")
            return False
        print("*" * 100)
    except KeyError:
        print('Vector unavailable for testing data')
        return None

In [15]:
test_size = 20
cutoff = 750
X, P = train_new(capital_of, test_size)
# print(X)
# print(P)

  """


Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
Vector unavailable for training data
V

In [16]:
print("*" * 200)
print("Testing train data")
train_prediction_true = 0
train_prediction_false = 0
total = 0
for country, capital in list(capital_of.items())[:-test_size]:
    prediction = predict_relationship_new(country, capital, X, cutoff)
    if prediction == True:
        train_prediction_true += 1
    elif prediction == False:
        train_prediction_false += 1
total += (train_prediction_true + train_prediction_false)
print("*" * 200)

print("\n\n\n")

********************************************************************************************************************************************************************************************************
Testing train data
****************************************************************************************************
Afghanistan , Kabul
238.81683
Capital_of relationship identified
****************************************************************************************************
Albania , Tirana
503.0289
Capital_of relationship identified
****************************************************************************************************
Algeria , Algiers
485.14056
Capital_of relationship identified
Vector unavailable for testing data
****************************************************************************************************
Angola , Luanda
386.162
Capital_of relationship identified
Vector unavailable for testing data
Vector unavailable for testing data
********************

  This is separate from the ipykernel package so we can avoid doing imports until


257.72842
Capital_of relationship identified
****************************************************************************************************
Azerbaijan , Baku
107.51273
Capital_of relationship identified
Vector unavailable for testing data
****************************************************************************************************
Bahrain , Manama
178.33926
Capital_of relationship identified
****************************************************************************************************
Bangladesh , Dhaka
228.0434
Capital_of relationship identified
****************************************************************************************************
Barbados , Bridgetown
160.25189
Capital_of relationship identified
****************************************************************************************************
Belarus , Minsk
279.10248
Capital_of relationship identified
**************************************************************************************************

244.56248
Capital_of relationship identified
****************************************************************************************************
Kazakhstan , Astana
171.29227
Capital_of relationship identified
****************************************************************************************************
Kenya , Nairobi
105.63209
Capital_of relationship identified
Vector unavailable for testing data
****************************************************************************************************
Kosovo , Pristina
141.11726
Capital_of relationship identified
Vector unavailable for testing data
****************************************************************************************************
Kyrgyzstan , Bishkek
210.94543
Capital_of relationship identified
****************************************************************************************************
Laos , Vientiane
438.18973
Capital_of relationship identified
*************************************************************

184.33342
Capital_of relationship identified
Vector unavailable for testing data
Vector unavailable for testing data
****************************************************************************************************
Spain , Madrid
212.97241
Capital_of relationship identified
Vector unavailable for testing data
****************************************************************************************************
Sudan , Khartoum
223.96889
Capital_of relationship identified
****************************************************************************************************
Suriname , Paramaribo
191.57336
Capital_of relationship identified
****************************************************************************************************
Swaziland , Mbabane
232.25655
Capital_of relationship identified
****************************************************************************************************
Sweden , Stockholm
242.2636
Capital_of relationship identified
*************************

In [None]:
print("*" * 200)
print("Testing test data")
test_prediction_true = 0
test_prediction_false = 0

for country, capital in list(capital_of.items())[-test_size:]:
    prediction = predict_relationship_new(country, capital, X, cutoff)
    if prediction == True:
        test_prediction_true += 1
    elif prediction == False:
        test_prediction_false += 1
total += (test_prediction_true + test_prediction_false)
print("*" * 200)
print("\n\n\n")

In [17]:
print("*" * 200)
print("Train data positive:", train_prediction_true)
print("Train data negative:", train_prediction_false)
accuracy = train_prediction_true / (train_prediction_false + train_prediction_true)
print("Train data positive accuracy:", accuracy, accuracy * 100, "%")
print("*" * 200)

print("\n\n\n")

print("\n\n\n")

print("*" * 200)
print("Test data positive:", test_prediction_true)
print("Test data negative:", test_prediction_false)
accuracy = test_prediction_true / (test_prediction_false + test_prediction_true)
print("Test data positive accuracy:", accuracy, accuracy * 100, "%") 
print("*" * 200)

print("\n\n\n")

********************************************************************************************************************************************************************************************************
Train data positive: 110
Train data negative: 3
Train data positive accuracy: 0.9734513274336283 97.34513274336283 %
********************************************************************************************************************************************************************************************************








********************************************************************************************************************************************************************************************************


NameError: name 'test_prediction_true' is not defined