In [2]:
!pip install -U spacy
!pip install -U gensim

Collecting spacy
  Downloading spacy-3.8.11-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.15-cp313-cp313-win_amd64.whl.metadata (2.3 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.13-cp313-cp313-win_amd64.whl.metadata (9.9 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.12-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.10-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.2-cp313-cp313-win_am

In [4]:
# Toy corpus: four short sentences that share most of their words.
documents = ["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]

# Normalise every sentence: lowercase it, then strip the full stops.
processed_docs = []
for sentence in documents:
    lowered = sentence.lower()
    processed_docs.append(lowered.replace(".", ""))
print(processed_docs)

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']


In [5]:
# Build the vocabulary: every distinct word gets a 1-based id, handed out in
# order of first appearance across the corpus.
vocab = {}
count = 0
for doc in processed_docs:
    for word in doc.split():
        if word in vocab:
            continue  # already indexed, keep its original id
        count += 1
        vocab[word] = count
print(vocab)

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}


In [7]:
def get_onehot_vector(somestring, vocabulary=None):
    """Return one one-hot vector per whitespace-separated word of ``somestring``.

    Args:
        somestring: Text to encode; it is split on whitespace.
        vocabulary: Optional mapping from word to a 1-based index. When
            omitted, the notebook-level ``vocab`` dict is used, which is the
            behaviour this function had before the parameter existed.

    Returns:
        A list containing one list of ints per word. Each inner list has
        ``len(vocabulary)`` entries, all 0 except for a 1 at position
        ``index - 1``. A word that is not in the vocabulary yields an
        all-zero vector.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built earlier in the notebook.
        vocabulary = vocab
    width = len(vocabulary)
    onehot_encoded = []
    for word in somestring.split():
        temp = [0] * width
        index = vocabulary.get(word)
        if index is not None:
            # Vocabulary ids start at 1, list positions at 0.
            temp[index - 1] = 1
        onehot_encoded.append(temp)
    return onehot_encoded
# Encode the second document word by word and show the resulting vectors.
second_doc_onehot = get_onehot_vector(processed_docs[1])
print(second_doc_onehot)

[[0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]


In [12]:
# Step 2: Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Fit a count-based vectorizer on the corpus; row i of `bow_rep` is the
# bag-of-words vector of document i.
count_vect = CountVectorizer()
bow_rep = count_vect.fit_transform(processed_docs)

# Word -> column index mapping learned from the corpus.
print("Our vocabulary: ", count_vect.vocabulary_)

# The first two documents contain the same words in a different order, so
# their bag-of-words vectors come out identical.
first_doc_bow = bow_rep[0].toarray()
second_doc_bow = bow_rep[1].toarray()
print("BoW representation for 'dog bites man': ", first_doc_bow)
print("BoW representation for 'man bites dog': ", second_doc_bow)

# Encode an unseen sentence with the already-fitted vocabulary: only "dog" is
# a known word here, and it is counted twice.
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Our vocabulary:  {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
BoW representation for 'dog bites man':  [[1 1 0 0 1 0]]
BoW representation for 'man bites dog':  [[1 1 0 0 1 0]]
Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


In [13]:
# Binary bag of words: record whether a word occurs, not how often.
count_vect_bin = CountVectorizer(binary=True)
bow_rep_bin = count_vect_bin.fit_transform(processed_docs)

# "dog" appears twice in this sentence but is reported as 1.
temp_bin = count_vect_bin.transform(["dog and dog are friends"])
binary_dense = temp_bin.toarray()
print("Binary Bow for 'dog and dog are friends':", binary_dense)

# N-gram bag of words: the vocabulary holds unigrams, bigrams and trigrams.
ngram_sizes = (1, 3)
count_vect_ngram = CountVectorizer(ngram_range=ngram_sizes)
bow_rep_ngram = count_vect_ngram.fit_transform(processed_docs)
print("N-gram vocabulary: ", count_vect_ngram.vocabulary_)

Binary Bow for 'dog and dog are friends': [[0 1 0 0 0 0]]
N-gram vocabulary:  {'dog': 3, 'bites': 0, 'man': 12, 'dog bites': 4, 'bites man': 2, 'dog bites man': 5, 'man bites': 13, 'bites dog': 1, 'man bites dog': 14, 'eats': 8, 'meat': 17, 'dog eats': 6, 'eats meat': 10, 'dog eats meat': 7, 'food': 11, 'man eats': 15, 'eats food': 9, 'man eats food': 16}


In [16]:
#Step 3
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the corpus; each row of `bow_rep_tfidf` is one weighted document.
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)

# One IDF weight per vocabulary term, printed next to the terms themselves.
idf_values = tfidf.idf_
feature_names = tfidf.get_feature_names_out()
print("IDF values:", idf_values)
print("Feature names:", feature_names)

# Transform a sample sentence with the fitted model.
temp_tfidf = tfidf.transform(["dog and man are friends"])
tfidf_dense = temp_tfidf.toarray()
print("Tfidf for 'dog and man are friends':\n", tfidf_dense)

IDF values: [1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]
Feature names: ['bites' 'dog' 'eats' 'food' 'man' 'meat']
Tfidf for 'dog and man are friends':
 [[0.         0.70710678 0.         0.         0.70710678 0.        ]]


In [21]:
from gensim.models import Word2Vec, KeyedVectors

# Pre-trained vectors stored in binary word2vec format; the path is relative
# to the notebook's working directory.
pretrainedpath = "./model/GoogleNews-vectors-negative300.bin"
w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True)

# `key_to_index` has one entry per known token, so its length is the vocabulary size.
vocabulary_size = len(w2v_model.key_to_index)
print("Vocabulary size:", vocabulary_size)

# Nearest neighbours of the query word, as (word, similarity) pairs.
nearest_neighbours = w2v_model.most_similar("beautiful")
print("Similar to 'beautiful':", nearest_neighbours)

# The raw embedding of the query word.
beautiful_vector = w2v_model['beautiful']
print("Vector for 'beautiful':", beautiful_vector)

Vocabulary size: 3000000
Similar to 'beautiful': [('gorgeous', 0.8353004455566406), ('lovely', 0.8106936812400818), ('stunningly_beautiful', 0.7329413294792175), ('breathtakingly_beautiful', 0.7231340408325195), ('wonderful', 0.6854087114334106), ('fabulous', 0.6700064539909363), ('loveliest', 0.6612576246261597), ('prettiest', 0.6595001816749573), ('beatiful', 0.6593326330184937), ('magnificent', 0.6591402888298035)]
Vector for 'beautiful': [-0.01831055  0.05566406 -0.01153564  0.07275391  0.15136719 -0.06176758
  0.20605469 -0.15332031 -0.05908203  0.22851562 -0.06445312 -0.22851562
 -0.09472656 -0.03344727  0.24707031  0.05541992 -0.00921631  0.1328125
 -0.15429688  0.08105469 -0.07373047  0.24316406  0.12353516 -0.09277344
  0.08203125  0.06494141  0.15722656  0.11279297 -0.0612793  -0.296875
 -0.13378906  0.234375    0.09765625  0.17773438  0.06689453 -0.27539062
  0.06445312 -0.13867188 -0.08886719  0.171875    0.07861328 -0.10058594
  0.23925781  0.03808594  0.18652344 -0.112792

In [23]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train a tiny Word2Vec model on gensim's bundled toy corpus.
# Training is stochastic, so the seed is fixed to keep the printed neighbours
# and vector stable across runs. The gensim docs note that a deterministic run
# also needs a single worker thread (multi-threaded training introduces
# ordering jitter) and, across interpreter launches, a fixed PYTHONHASHSEED.
our_model = Word2Vec(
    common_texts,
    vector_size=10,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Five nearest neighbours of 'computer', followed by its learned 10-d embedding.
print("Similar to 'computer:", our_model.wv.most_similar('computer',topn=5))
print("10-demensional vector for 'computer':", our_model.wv['computer'])

Similar to 'computer: [('eps', 0.2914133667945862), ('trees', 0.05541812628507614), ('minors', 0.04264767840504646), ('survey', -0.021763404831290245), ('interface', -0.15233564376831055)]
10-demensional vector for 'computer': [ 0.0163195   0.00189972  0.03474648  0.00217841  0.09621626  0.05062076
 -0.08919986 -0.0704361   0.00901718  0.06394394]


In [24]:
#Step 5 Reflection
# Written answer on why out-of-vocabulary words must be handled. The text is
# split over adjacent literals purely for readability; they concatenate to a
# single string.
print(
    "It is important because if the word is not in the vocabulary "
    "or the word dosent exist in vocabulary, we well get a KeyError or exception. "
    "And also models are trained on specific corpora, "
    "so not all words are presented there.  "
)

It is important because if the word is not in the vocabulary or the word dosent exist in vocabulary, we well get a KeyError or exception. And also models are trained on specific corpora, so not all words are presented there.  
