# **BoW**

In [1]:
# Toy corpus: three short documents that the vectorizer examples below run on.
sample = [
    'problem of evil',
    'evil queen',
    'horizon problem',
]

In [5]:
#For a vectorization of this data based on word count (BoW), we could construct a column 
#representing the word "problem," the word "evil," the word "horizon," and so on.

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Learn the vocabulary of `sample` and count how often each word occurs in
# each document. fit_transform returns a sparse matrix with one row per
# document and one column per vocabulary word.
vec = CountVectorizer()
X = vec.fit_transform(sample)

# Vocabulary words ordered by their column position. `vocabulary_` maps each
# term to its column index, so sorting the terms by that index gives the
# column labels on any scikit-learn release. The former
# `vec.get_feature_names()` call was removed in scikit-learn 1.2.
feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)

# Densify once, then label it: one row per vocabulary word, one column per
# document. Works for any number of documents, not just three.
df = pd.DataFrame(
    X.toarray().T,
    index=pd.Index(feature_names, name='vocabulary'),
    columns=['document{} vector'.format(i + 1) for i in range(X.shape[0])],
)

# Transpose for display so that each printed row is one document vector.
print(df.T)

#The result is a sparse matrix recording the number of times each word appears; 
#it is easier to inspect if we convert this to a DataFrame with labeled columns:

vocabulary        evil  horizon  of  problem  queen
document1 vector     1        0   1        1      0
document2 vector     1        0   0        0      1
document3 vector     0        1   0        1      0


# **N-grams**

In [7]:
#An n-gram is a sequence of N consecutive words in a sentence. N-grams capture the context in which words are used together (e.g. "San Francisco").
#A plain bag of words is sparse and does not take into consideration the order in which the words appear in a document; only individual words are counted.

from nltk import ngrams #nltk ia a suite of libraries and programs for symbolic and statistical natural language processing

# Option 1: NLTK. Split each document on spaces and print every pair of
# adjacent words (bigram) as a tuple.
for sentence in sample:
  ngram = ngrams(sentence.split(' '), n=2)
  for x in ngram:
    print(x)

#OR
# Option 2: scikit-learn. ngram_range=(2,2) makes CountVectorizer count
# bigrams only, instead of single words.
vectorizer = CountVectorizer(ngram_range=(2,2))
X = vectorizer.fit_transform(sample)
# Testing the ngram generation: list the bigrams in column order, then show the counts.
# `vocabulary_` maps each bigram to its column index; sorting by that index
# replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))
print(X.toarray())

('problem', 'of')
('of', 'evil')
('evil', 'queen')
('horizon', 'problem')
['evil queen', 'horizon problem', 'of evil', 'problem of']
[[0 0 1 1]
 [1 0 0 0]
 [0 1 0 0]]


# **TF-IDF**

In [None]:
#There are some issues with BoW, however: the raw word counts lead to features which put too much weight on 
#words that appear very frequently, and this can be sub-optimal in some classification algorithms. One approach to 
#fix this is known as term frequency-inverse document frequency (TF–IDF) which weights the word counts by a measure of  how often they 
#appear in the documents. The syntax for computing these features is similar to the previous example:

from sklearn.feature_extraction.text import TfidfVectorizer
def createtfidf(sample):
  """Return the TF-IDF weights of `sample` as a labelled DataFrame.

  A new TfidfVectorizer with default settings is fitted on `sample` on every
  call, so results from different calls do not share a vocabulary.

  Parameters
  ----------
  sample : iterable of str
      The documents to vectorize.

  Returns
  -------
  pandas.DataFrame
      One row per document and one column per vocabulary word.
  """
  vec = TfidfVectorizer()# or vec = TfidfVectorizer(ngram_range=(2,2), norm=None) # You can still specify n-grams here.
  X = vec.fit_transform(sample)
  # `vocabulary_` maps each term to its column index; sorting the terms by that
  # index yields the column labels on any scikit-learn release
  # (`get_feature_names()` was removed in scikit-learn 1.2).
  feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
  return pd.DataFrame(X.toarray(), columns=feature_names)

createtfidf(sample)

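#Detailed (the same steps, with fit and transform as two separate calls)
#from sklearn.feature_extraction.text import TfidfVectorizer
#vec = TfidfVectorizer()
#vec.fit(sample)
#X = vec.transform(sample)
#pd.DataFrame(X.toarray(), columns=vec.get_feature_names())  # on scikit-learn >= 1.0 use vec.get_feature_names_out(); get_feature_names() was removed in 1.2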

Unnamed: 0,evil,horizon,of,problem,queen
0,0.517856,0.0,0.680919,0.517856,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,0.795961,0.0,0.605349,0.0


In [None]:
#We’ll change the first message from 'problem of evil' to 'problem problem problem of evil'. 
#We should expect the term frequency for 'problem' to increase and therefore the TF-IDF value for problem in the first message to increase.

# Same corpus as before, except that 'problem' now occurs three times in the
# first document.
sample2 = [
    'problem problem problem of evil',
    'evil queen',
    'horizon problem',
]
createtfidf(sample2)

#The values for the other words in the first message have decreased. Their raw counts are unchanged, but 'problem' now makes up a larger share 
#of the message, and each row is length-normalized (TfidfVectorizer's default L2 norm), so their weights shrink. Also, the value for 'problem' in the third message is unchanged from our first example. 
#This is because we haven't done anything to change the IDF portion of the TF-IDF: in both examples 'problem' appears in the same two of the three messages.

Unnamed: 0,evil,horizon,of,problem,queen
0,0.291992,0.0,0.383935,0.875976,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,0.795961,0.0,0.605349,0.0


In [None]:
#Let's try manipulating the messages to change the IDF portion of the TF-IDF. We'll change our third message from 'horizon problem' 
#to 'horizon'. Now the word 'problem' only occurs in one message, so we should expect its value to increase as its IDF value is increasing.

# Here 'problem' is removed from the last document, so it now occurs in only
# one of the three documents.
sample3 = [
    'problem problem problem of evil',
    'evil queen',
    'horizon',
]
createtfidf(sample3)

Unnamed: 0,evil,horizon,of,problem,queen
0,0.233832,0.0,0.307461,0.922383,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,1.0,0.0,0.0,0.0


**Note:** TfidfVectorizer is the equivalent of CountVectorizer followed by TfidfTransformer

# **Word2Vec**

In [8]:
# The same three-document toy corpus, defined again at the start of the
# Word2Vec section.
sample = [
    'problem of evil',
    'evil queen',
    'horizon problem',
]

In [9]:
import nltk
import gensim
nltk.download('punkt')
from gensim.models import Word2Vec

# Tokenize every document: Word2Vec is given one list of tokens per document.
all_words = [nltk.word_tokenize(sent) for sent in sample]

word2vec= Word2Vec (all_words, min_count=2) #include only words in the Word2Vec model that appear at least twice in the corpus.
#model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=2)
#size: the number of dimensions in the vector space. If the value is 100, then the meaning representation of each word will be a 100-dimensional vector.
#      (gensim >= 4 renamed this parameter to vector_size)
#window: the number of neighboring words (on each side) considered when building the model.
#min_count: words whose frequency is less than this value are ignored.
#workers: the number of parallel processes used during the training process. You can speed up the training by setting this to a larger value if you have more than 2 CPU cores on your machine.

# The model's vocabulary is a dictionary, with the keys being each token (word).
# gensim 4 removed `wv.vocab` (accessing it raises AttributeError) in favour of
# `wv.key_to_index`, so use the new attribute when it exists and fall back to
# the old one on gensim 3.
if hasattr(word2vec.wv, "key_to_index"):
  vocabulary = word2vec.wv.key_to_index
else:
  vocabulary = word2vec.wv.vocab

print(vocabulary)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
{'problem': <gensim.models.keyedvectors.Vocab object at 0x7f20e7849748>, 'evil': <gensim.models.keyedvectors.Vocab object at 0x7f20e78497b8>}


In [10]:
# A trained Word2Vec model maps each vocabulary word to a vector.
# Indexing `word2vec.wv` with a word returns that word's vector; leaving `v1`
# as the last expression makes the notebook display it.
v1 = word2vec.wv['problem']
v1

array([ 4.3532243e-03,  5.4906227e-04,  1.6951429e-03,  1.8797563e-03,
        2.0063501e-03,  2.0611950e-03, -7.4272836e-04, -3.5881069e-03,
       -2.5332409e-03, -4.6822149e-03,  2.4555156e-03, -1.2602224e-03,
       -4.5176959e-03,  4.9101971e-03,  1.1482899e-03, -4.9421499e-03,
        3.9329231e-03,  4.0613432e-04, -3.2957762e-03, -4.9075210e-03,
       -3.9906758e-03,  2.8049382e-03,  1.5063480e-03,  1.4618784e-03,
       -4.3085837e-03,  4.7939052e-03,  1.6772733e-03, -4.9810894e-03,
       -4.2805793e-03, -3.1284932e-03, -3.3929537e-03, -3.0437706e-03,
        4.1656545e-03, -9.7444368e-04,  4.9749026e-03,  1.0092796e-03,
        2.3884648e-03,  2.6779745e-03, -4.3290509e-03, -3.2960088e-03,
       -2.7309899e-04, -4.2646006e-03, -1.5908709e-03, -8.4851222e-04,
        4.5451554e-03,  4.6358756e-03,  4.4557885e-03, -2.0368285e-04,
       -3.8463605e-04,  2.4401334e-05,  4.7490420e-03, -3.9129648e-03,
       -4.5498712e-03,  5.2227082e-05, -4.8382184e-04,  3.9212620e-03,
      

In [11]:
# Find the nearest neighbors of 'problem': the vocabulary words most similar to it.
# Variant from the original note: most_similar('problem', topn=2), which
# presumably caps the number of neighbors returned -- confirm against the gensim docs.
sim_words = word2vec.wv.most_similar('problem')
sim_words

# Each result has the form (similar word, similarity index),
# e.g. ('evil', 0.09049832820892334). The recorded output of this cell shows a
# different score, so expect the number to vary between training runs.

  if np.issubdtype(vec.dtype, np.int):


[('evil', 0.0488138273358345)]

In [12]:
# Compute the cosine similarity between the vectors associated with the two words.
# For this pair it is the same score that most_similar('problem') reports for 'evil'.
wordsim = word2vec.wv.similarity('problem', 'evil')
wordsim

  if np.issubdtype(vec.dtype, np.int):


0.04881383

In [19]:
# doesnt_match is meant to pick, from the given words, the one that least belongs with the rest.
# NOTE(review): 'horizon' occurs only once in `sample`, so the min_count=2 model has
# no vector for it; presumably it is ignored here, which would leave 'evil' as the
# only candidate -- confirm, and consider a larger corpus for a meaningful demo.
dissimilar= word2vec.wv.doesnt_match("evil horizon".split())
dissimilar

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'evil'

In [13]:
#save model
# gensim writes the file itself, so no open() handle is needed. The previous
# `with open("word2vecf.model", "w")` wrapper only created an empty file with
# a different name from the path the model is actually saved to.
word2vec.save("word2vecf")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
#load model
# Read the model back from the same path that was passed to word2vec.save().
model = gensim.models.Word2Vec.load("word2vecf")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
#download model (Google Colab only)
#from google.colab import files
#files.download("word2vecf")  # must match the path passed to word2vec.save() above

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>