In [1]:
#Basic dependencies
import codecs, glob, os, re, multiprocessing
import numpy as np
import gensim.models.atmodel



In [2]:
#External libraries
import nltk                           #For generating corpus
import gensim.models.word2vec as w2v  #To generate vectors from word embeddings
import sklearn.manifold               #To reduce the dimensions 
import matplotlib.pyplot as plt       #For visual representation
import pandas as pd                   #To manipulate data
import seaborn as sns                 #To Plot graphs


In [4]:
nltk.download("punkt")      #pretrained tokenizer
nltk.download("stopwords")  #words that are insignificant for training the model
nltk.download("genesis")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\shrav\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shrav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
raw_corpus = u""

subjectname = input("Subject name: ")
textfile = sorted(glob.glob("Resources\\" + subjectname + "\\" + "\*.txt"))

for text in textfile:
    print(f"Reading '{text}'")
    
    with codecs.open(text, "r", "utf-8") as file:
        raw_corpus += file.read()


Subject name: Machine Learning
Reading 'Resources\Machine Learning\machinelearning.txt'


In [14]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(raw_corpus)


In [15]:
cachedStopWords =  nltk.corpus.stopwords.words("english")

In [16]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]", " ", raw)
    words = clean.split()
    stemmer = nltk.PorterStemmer()
    
    filtered_sentence = [w for w in words if not w in cachedStopWords]
    return filtered_sentence

In [17]:
sentences = []

for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))


The book corpus contains 3,797 tokens


In [18]:
# 3 main properties that vectors help us with
# DISTANCE, SIMILARITY, RANKING
num_features = 300
min_word_count = 3
num_workers = multiprocessing.cpu_count()
context_size = 7
downsampling = 1e-3
seed = 1

In [19]:
learningmodel = w2v.Word2Vec(
    sg = 1,
    seed = seed,
    workers = num_workers,
    size = num_features,
    min_count = min_word_count,
    window = context_size,
    sample = downsampling
    )

learningmodel.build_vocab(sentences)

In [21]:
print("Vocabulary length: ", len(learningmodel.wv.vocab))

#Training our model
learningmodel.train(sentences, total_examples = token_count, epochs= 25)

filename = str(input("Subject : "))

if not os.path.exists("Training Data\\" + filename):
    location = "Training Data\\" + filename
    os.makedirs(location)

learningmodel.save(os.path.join(location, filename+".w2v"))



Vocabulary length:  318
Subject : Myapp


In [27]:
learningmodel.wv.most_similar(positive=["Learning"])


[('Intelligence', 0.9951659440994263),
 ('source', 0.9925799369812012),
 ('Machine', 0.9925317764282227),
 ('software', 0.991557776927948),
 ('open', 0.9913204312324524),
 ('Research', 0.9864009022712708),
 ('Google', 0.9616549611091614),
 ('nMachine', 0.9323292970657349),
 ('variety', 0.924755334854126),
 ('A', 0.917307436466217)]