In [4]:
import nltk
import re
from gensim.models import Word2Vec
from nltk.corpus import stopwords



In [5]:
paragraph = """I have three visions for India. In 3000 years of our history, people from all over 
               the world have come and invaded us, captured our lands, conquered our minds. 
               From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,
               the French, the Dutch, all of them came and looted us, took over what was ours. 
               Yet we have not done this to any other nation. We have not conquered anyone. 
               We have not grabbed their land, their culture, 
               their history and tried to enforce our way of life on them. 
               Why? Because we respect the freedom of others.That is why my 
               first vision is that of freedom. I believe that India got its first vision of 
               this in 1857, when we started the War of Independence. It is this freedom that
               we must protect and nurture and build on. If we are not free, no one will respect us.
               My second vision for India’s development. For fifty years we have been a developing nation.
               It is time we see ourselves as a developed nation. We are among the top 5 nations of the world
               in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling.
               Our achievements are being globally recognised today. Yet we lack the self-confidence to
               see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
               I have a third vision. India must stand up to the world. Because I believe that unless India 
               stands up to the world, no one will respect us. Only strength respects strength. We must be 
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career"""


In [7]:
# Preprocessing the data
text = re.sub(r'\[[0-9]*\]',' ',paragraph)
text = re.sub(r'\s+',' ',text) # removing spaces
text = text.lower()
text = re.sub(r'\d',' ',text)  # additional spaces

# Preparing the dataset
sentences = nltk.sent_tokenize(text)

sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

for i in range(len(sentences)):
    sentences[i] = [word for word in sentences[i] if word not in stopwords.words('english')]
    

In [9]:
# Training the Word2Vec model
model = Word2Vec(sentences, min_count=1)  # if word is less than 1 skip that word
model

<gensim.models.word2vec.Word2Vec at 0x270d6c6898>

In [10]:
words = model.wv.vocab
words


{'three': <gensim.models.keyedvectors.Vocab at 0x270d6c6278>,
 'visions': <gensim.models.keyedvectors.Vocab at 0x270d6c6860>,
 'india': <gensim.models.keyedvectors.Vocab at 0x270d6c6400>,
 '.': <gensim.models.keyedvectors.Vocab at 0x270d6c60f0>,
 'years': <gensim.models.keyedvectors.Vocab at 0x270d5c8dd8>,
 'history': <gensim.models.keyedvectors.Vocab at 0x270dbecfd0>,
 ',': <gensim.models.keyedvectors.Vocab at 0x270dbecda0>,
 'people': <gensim.models.keyedvectors.Vocab at 0x270dbec208>,
 'world': <gensim.models.keyedvectors.Vocab at 0x270dbec3c8>,
 'come': <gensim.models.keyedvectors.Vocab at 0x270dbec240>,
 'invaded': <gensim.models.keyedvectors.Vocab at 0x270dbec4a8>,
 'us': <gensim.models.keyedvectors.Vocab at 0x270dbec2b0>,
 'captured': <gensim.models.keyedvectors.Vocab at 0x270dbec780>,
 'lands': <gensim.models.keyedvectors.Vocab at 0x270dbec5c0>,
 'conquered': <gensim.models.keyedvectors.Vocab at 0x270dbecc18>,
 'minds': <gensim.models.keyedvectors.Vocab at 0x270dbece80>,
 'alex

In [12]:
# Finding Word Vectors
vector = model.wv['war']
vector

array([ 4.0796548e-03,  3.3689907e-03,  6.2614155e-04,  4.7819465e-03,
       -4.3614088e-03, -1.3607506e-03,  1.3178831e-03, -1.2445884e-03,
        1.2731553e-03, -3.7969893e-03, -4.7469665e-03,  3.5613270e-03,
        1.5201436e-03, -4.8266258e-03,  3.2963965e-03,  4.2620515e-03,
        4.7766049e-03,  2.9250570e-03,  4.9136770e-03, -1.8038802e-03,
        1.3287945e-03,  2.4229074e-03, -6.2579161e-04,  1.6861714e-03,
       -2.2328785e-03, -4.3921773e-03,  2.3693778e-04, -3.6165738e-03,
       -2.3491867e-04,  2.5476564e-03, -4.1029355e-03, -2.1957012e-03,
        1.3147179e-03, -2.1152403e-03,  3.0593185e-03,  3.3869455e-03,
       -3.4622161e-04,  4.6419953e-03, -2.9825470e-03,  4.3899110e-03,
       -4.7942204e-03, -4.6123634e-03, -2.9442890e-03, -4.3702414e-03,
        3.6025622e-03,  2.8390175e-04,  2.4612271e-03, -3.8307675e-03,
       -5.9921014e-05, -3.1931042e-05, -6.0518208e-04,  3.5760866e-03,
        2.9377290e-03, -2.9530574e-03, -1.3185024e-03,  1.5605586e-03,
      

In [14]:

# Most similar words
similar = model.wv.most_similar('war')
similar


[('turks', 0.23862268030643463),
 ('others.that', 0.23347406089305878),
 ('years', 0.20006385445594788),
 ('unless', 0.19962386786937714),
 ('fortune', 0.17946337163448334),
 ('self-reliant', 0.1764136552810669),
 ('dept', 0.17581623792648315),
 ('falling', 0.17548729479312897),
 ('’', 0.16740071773529053),
 ('four', 0.15844839811325073)]

In [16]:
# Most similar words
similar = model.wv.most_similar('vikram')
similar

[('incorrect', 0.3298473656177521),
 ('achievements', 0.20142939686775208),
 ('recognised', 0.1975674331188202),
 ('life', 0.1881031095981598),
 ('milestones', 0.18523181974887848),
 ('respect', 0.18484282493591309),
 ('nations', 0.16792313754558563),
 ('prakash', 0.1463736891746521),
 ('years', 0.14416201412677765),
 ('build', 0.1440231204032898)]

# check with Job description

In [24]:
paragraph="""Job Summary
Minimum Qualifications

Embedded: Experience with embedded Linux, embedded programming, microcontrollers, peripherals and its basic electronics. Proficient in communication protocol like i2c, spi, uart, bluetooth LE, WiFi and server protocol like Http, Https, Mqtt from embedded devices.
Electronics: Experience with testing and debugging embedded electronics. Operating knowledge of standard electronic test equipment is nice to have.
Languages: Experience of coding Python required. Additional familiarity with C, Node.js, Javascript/TypeScript beneficial.
APIs: Experience working with Rest APIs and request/response or client/server model.
Cloud: Experience with IoT and backend application development on one or more of public cloud platforms such as AWS, Azure or Google Cloud Platform.
Education: Bachelor's degree in a technical or engineering field, or equivalent practical experience.
Tools: Experience with version control (Git) and agile processes.
Responsibilities

Help architect, design, build and scale optimized and performant QikPod device embedded software components.
Implement IoT device-to-cloud communications using MQTT / HTTP / REST protocols.
Keywords

Python, C, Embedded Software, IoT, Embedded, Firmware, WiFi, MQTT, HTTP, sensors, MCU, BLE, I2C, SPI, UART.
Expectations

Write clean, well-designed code.
Produce detailed specifications.
Build reusable code and libraries for future use.
Troubleshoot, test and maintain the software and databases to ensure strong optimization and functionality.
Keep security and reliability as a design principle rather than afterthought.
Build reusable code and libraries for future use.
Optimize for speed, scalability and performance.
About the role
QikPod product team is building next-generation smart logistics IoT devices, cloud services & mobile apps. Our goal is to make the last mile of E-Commerce logistics simpler, safer, and speedier. We are backed by top-tier investors and global strategic partners. Our work is challenging and ambitious, which makes for a fun rich learning experience. We will expect you to exhibit technical expertise in managing individual project priorities, deadlines and deliverables. We value teamwork, initiative, problem solving skills, high integrity and strong work ethic, self-motivation and good time management skills across our team.
Job Perks

Competitive salary based on experience, contribution and/or skill-set.
12 Paid holidays plus 27 days of Sick/Casual/Earned Leaves.
Provident fund (PF).
Health Insurance for you, your spouse & children.
Macbook laptop and optional monitor.
Lunch service and pantry with hot/cold beverages and snacks.
Personal development and professional growth.
Balanced predictable work hours and well connected location.
Opportunity to develop innovations for last mile of E-Commerce.
Job Type: Full-time

Benefits:

Health insurance
Provident fund (PF)
Paid leaves / Leave encashment
Industry:

Software Development"""

In [25]:
# Preprocessing the data
text = re.sub(r'\[[0-9]*\]',' ',paragraph)
text = re.sub(r'\s+',' ',text) # removing spaces
text = text.lower()
text = re.sub(r'\d',' ',text)  # additional spaces

# Preparing the dataset
sentences = nltk.sent_tokenize(text)

sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

for i in range(len(sentences)):
    sentences[i] = [word for word in sentences[i] if word not in stopwords.words('english')]

In [26]:
# Training the Word2Vec model
model = Word2Vec(sentences, min_count=1)  # if word is less than 1 skip that word
model


<gensim.models.word2vec.Word2Vec at 0x270d7037f0>

In [27]:
words = model.wv.vocab
words


{'job': <gensim.models.keyedvectors.Vocab at 0x270d7039b0>,
 'summary': <gensim.models.keyedvectors.Vocab at 0x270d703908>,
 'minimum': <gensim.models.keyedvectors.Vocab at 0x270d7039e8>,
 'qualifications': <gensim.models.keyedvectors.Vocab at 0x270d703a58>,
 'embedded': <gensim.models.keyedvectors.Vocab at 0x270d703198>,
 ':': <gensim.models.keyedvectors.Vocab at 0x270d703358>,
 'experience': <gensim.models.keyedvectors.Vocab at 0x270d7033c8>,
 'linux': <gensim.models.keyedvectors.Vocab at 0x270d703a90>,
 ',': <gensim.models.keyedvectors.Vocab at 0x270d703240>,
 'programming': <gensim.models.keyedvectors.Vocab at 0x270d703278>,
 'microcontrollers': <gensim.models.keyedvectors.Vocab at 0x270d703a20>,
 'peripherals': <gensim.models.keyedvectors.Vocab at 0x270d703ac8>,
 'basic': <gensim.models.keyedvectors.Vocab at 0x270d703390>,
 'electronics': <gensim.models.keyedvectors.Vocab at 0x270d703b70>,
 '.': <gensim.models.keyedvectors.Vocab at 0x270d703b00>,
 'proficient': <gensim.models.keye

In [29]:

# Most similar words
similar = model.wv.most_similar('http')
similar

[('problem', 0.23438017070293427),
 ('c', 0.22801029682159424),
 ('embedded', 0.2129822075366974),
 ('tools', 0.21278229355812073),
 ("'s", 0.21017825603485107),
 ('design', 0.19414377212524414),
 ('communication', 0.18965786695480347),
 ('minimum', 0.18815483152866364),
 ('specifications', 0.18297871947288513),
 ('server', 0.1784271001815796)]