In [15]:
import nltk

from gensim.models import Word2Vec
from nltk.corpus import stopwords

import re

In [16]:
# Raw source text for this notebook: an English description of the
# "Bahrain Economic Vision 2030". It is cleaned with regular expressions,
# tokenised and then used as the (very small) training corpus for Word2Vec.
# The literal keeps its original line breaks and irregular spacing.
paragraph = """Bahrain Economic Vision 2030
The Economic Vision 2030 (PDF, 279KB, 26 pages) which was launched in October 2008 by His Majesty
King Hamad bin Isa Al Khalifa, is a comprehensive economic vision for Bahrain, providing a clear
direction for the continued development of the Kingdom’s economy and, at its heart, is a shared
goal of building a better life for every Bahraini.
The launch of the Vision followed four years of extensive discussions with a range of opinion
leaders in the public and private sectors, including governmental institutions and organizations,
as well as international consultancies and bodies. The Economic Vision 2030 focuses on shaping the
vision of the government, society, and the economy, based around three guiding principles;
sustainability, fairness, and competitiveness. Following the launch of the Economic Vision 2030,
the Economic Development Board (EDB)   initiated an on-going programme of economic and institutional
reform. The EDB led and coordinated with ministries to compile the first National Economic Strategy,
which served as a roadmap to achieve the Vision. The vision 2030 also addresses the Sustainable Development Goals 2030 (SDGs).
The guiding principles of the Economic Vision 2030 are sustainability, competitiveness, and fairness.
Sustainability
A considerable share of the country’s growth over the last two decades was driven by the public sector.
This model is running out of steam, as government finances become tighter and competition increases
in a global economy. By 2030, the private sector should be able to drive economic growth in Bahrain independently.
Bahrain's Vision sees the economic prosperity built on a firm foundation. Government finances will adhere to the
principle of sustainability, upholding a system that is stable and forward-looking. Bahrain will use its resources
to invest in the future, improving its human capital through education and training, particularly in the field of
applied sciences. In a world where modern technology and new competitors from across the globe are continually
shortening product lifetimes, entrepreneurship, and innovation will ensure the sustainability of a vibrant
private sector. But economic growth must never come at the expense of the environment and the long-term
well-being of Bahrainis. No effort will be spared to protect Bahrain's environment and preserve the kingdom's cultural heritage.
Competitiveness.Bahrain will attain a high level of competitiveness in a global economy.
Increased productivity comes about much more naturally in a competitive environment,
driving economic growth, profitability, and wages. Only high and continuously improving productivity
will enable businesses to increase their employees' wages. Higher productivity requires people with the
right skills for each position. To be Competitive; Bahrain will go to great lengths to educate their people,

retain qualified staff, and attract foreign workers with the skills that are lacking.
The key is to make Bahrain a great place to do business for both local and foreign companies.
Many factors combined to make a country attractive to investors in high-value-added industries:
a high-quality public service, a cutting-edge infrastructure, and an appealing living environment are among."""

In [17]:
# Text clean-up, done in two passes over the raw paragraph.
# Pass 1: blank out bracketed numeric markers such as "[12]", squeeze every
#         whitespace run (including newlines) to one space, then lower-case.
text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', paragraph)).lower()
# Pass 2: replace each digit with a space and squeeze whitespace again so the
#         gaps left behind by removed digits do not survive.
text = re.sub(r'\s+', ' ', re.sub(r'\d', ' ', text))

In [18]:
# Fetch the NLTK data this notebook needs (nltk itself is imported in the
# first cell, so it is not re-imported here).
# - 'punkt' / 'punkt_tab': sentence-tokenizer models. Recent NLTK releases
#   look up 'punkt_tab' instead of the pickled 'punkt' package, so both are
#   requested to keep sent_tokenize/word_tokenize working across versions.
# - 'stopwords': word lists used for stop-word removal.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Build the training corpus: split the cleaned text into sentences, then
# split each sentence into word/punctuation tokens. The result, `sentences`,
# is a list with one token list per sentence.
sentence_strings = nltk.sent_tokenize(text)
sentences = list(map(nltk.word_tokenize, sentence_strings))

In [23]:
# Drop English stop words from every tokenised sentence.
# The stop-word list is fetched once and stored in a set: the previous version
# called stopwords.words('english') again for every single token and tested
# membership against a list. The filtered result is identical.
# Punctuation tokens are not stop words and are therefore kept.
english_stopwords = set(stopwords.words('english'))
sentences = [
    [word for word in sentence if word not in english_stopwords]
    for sentence in sentences
]

In [24]:
# Inspect the second tokenised sentence after stop-word removal
# (punctuation tokens such as ',' and '.' are still part of the list).
sentences[1]

['launch',
 'vision',
 'followed',
 'four',
 'years',
 'extensive',
 'discussions',
 'range',
 'opinion',
 'leaders',
 'public',
 'private',
 'sectors',
 ',',
 'including',
 'governmental',
 'institutions',
 'organizations',
 ',',
 'well',
 'international',
 'consultancies',
 'bodies',
 '.']

In [21]:
# Training the Word2Vec model
# min_count=1 keeps every token, even those seen only once (the corpus is tiny).
# seed is spelled out and workers=1 forces single-threaded training: with
# several worker threads the order of updates depends on OS scheduling, so the
# vectors and similarity scores change from run to run.
# NOTE: per the gensim docs, bit-for-bit reproducibility on Python 3 also needs
# the PYTHONHASHSEED environment variable to be fixed before the kernel starts.
model = Word2Vec(sentences, min_count=1, seed=1, workers=1)

In [26]:
# Look up the learned embedding of one vocabulary token.
# Raises KeyError if the token is not in the model's vocabulary.
lookup_word = 'leaders'
vector = model.wv[lookup_word]

In [27]:
# Display the embedding that was looked up for the token 'leaders'.
vector

array([ 7.94349611e-03, -6.75515085e-03,  3.69084504e-04,  3.11816158e-03,
       -2.10095686e-03,  9.36615374e-03, -3.65719595e-03,  3.89396353e-03,
        7.44461734e-03,  4.11277730e-03, -2.67444551e-03, -7.93132652e-03,
       -5.75616164e-03,  1.52393116e-03, -3.69273964e-03, -7.12926360e-03,
        7.43424520e-03,  4.08686278e-03,  2.39234674e-03, -6.83239079e-04,
        4.27038502e-03, -8.87822174e-03, -5.62296342e-03, -5.71860094e-03,
        3.48924636e-03, -2.15970003e-03, -7.31835840e-03, -7.14294100e-03,
        7.01943692e-03, -3.40537052e-03, -3.62527277e-03,  1.00723254e-02,
       -9.31588933e-04, -5.60368551e-03,  2.45381310e-03,  1.58647273e-03,
       -6.52337493e-03, -1.00024985e-02,  2.24626041e-03,  8.84058885e-03,
       -5.63257886e-03,  6.76593417e-03, -3.68327979e-04, -8.41884874e-03,
        6.46076351e-03, -6.71195937e-03, -2.90053152e-03,  1.70476455e-03,
        3.26952781e-04, -6.74038986e-03, -5.68629149e-03, -3.62414028e-03,
        7.82093778e-03,  

In [29]:
# Nearest neighbours of 'governmental' in the embedding space, ranked by
# cosine similarity. topn=10 is the library default, written out explicitly.
query_word = 'governmental'
similar = model.wv.most_similar(query_word, topn=10)

In [30]:
# Show the (word, similarity score) pairs returned for 'governmental'.
similar

[('extensive', 0.2659156620502472),
 (':', 0.24657799303531647),
 ('addresses', 0.2276732176542282),
 ('october', 0.21980170905590057),
 ('investors', 0.21536339819431305),
 ('vision', 0.205167755484581),
 ('national', 0.19210898876190186),
 ('naturally', 0.1836424320936203),
 ('great', 0.17985714972019196),
 ('principle', 0.1609119176864624)]

In [32]:
# Nearest neighbours of 'vision' in the embedding space, ranked by
# cosine similarity. topn=10 is the library default, written out explicitly.
query_word = 'vision'
similar = model.wv.most_similar(query_word, topn=10)

In [33]:
# Show the (word, similarity score) pairs returned for 'vision'.
similar

[('innovation', 0.29643914103507996),
 ('staff', 0.27249768376350403),
 ('naturally', 0.25683531165122986),
 ('industries', 0.22195768356323242),
 ('able', 0.2165793776512146),
 ('governmental', 0.2051677405834198),
 ('steam', 0.18378257751464844),
 ('development', 0.17930229008197784),
 ('system', 0.16646789014339447),
 ('based', 0.1655966341495514)]