## Exercise : One-Hot encoding Vector Representation

### Importing Dependencies

In [44]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import spacy

In [45]:
corpus = "Hello! My name is Safae ERAJI, an AI enthusiast. I am a student in AI and Data Science. Data Science and AI are much more fun in application, rather than just reading about it. I recommend you that you explore the world of AI and Data Science."

### Tokenization

In [47]:
sentences = sent_tokenize(corpus)
print("Sentences tokenization :", sentences)

Sentences tokenization : ['Hello!', 'My name is Safae ERAJI, an AI enthusiast.', 'I am a student in AI and Data Science.', 'Data Science and AI are much more fun in application than just thinking about it.', 'I recommend you that you explore the world of AI and Data Science.']


In [14]:
words_by_sentence = []
for i, sentence in enumerate(sentences):
    words = word_tokenize(sentence)
    words_by_sentence.append(words)
    print(f"Words in sentence {i+1}: {words}")

Words in sentence 1: ['Hello', '!']
Words in sentence 2: ['My', 'name', 'is', 'Safae', 'ERAJI', ',', 'an', 'AI', 'enthusiast', '.']
Words in sentence 3: ['I', 'am', 'a', 'student', 'in', 'AI', 'and', 'Data', 'Science', '.']
Words in sentence 4: ['Data', 'Science', 'and', 'AI', 'are', 'much', 'more', 'fun', 'than', 'in', 'application', 'than', 'just', 'thinking', 'about', 'it', '.']
Words in sentence 5: ['I', 'recommend', 'you', 'that', 'you', 'explore', 'the', 'world', 'of', 'AI', 'and', 'Data', 'Science', '.']


### Lowercasing

In [17]:
corpus_lower_case = []
for i, sentence in enumerate(words_by_sentence):
    lower_words = []
    for word in sentence:
        lower_words.append(word.lower())
    print(f"Lower casing in sentence {i+1}:", lower_words)
    corpus_lower_case.append(lower_words)

Lower casing in sentence 1: ['hello', '!']
Lower casing in sentence 2: ['my', 'name', 'is', 'safae', 'eraji', ',', 'an', 'ai', 'enthusiast', '.']
Lower casing in sentence 3: ['i', 'am', 'a', 'student', 'in', 'ai', 'and', 'data', 'science', '.']
Lower casing in sentence 4: ['data', 'science', 'and', 'ai', 'are', 'much', 'more', 'fun', 'than', 'in', 'application', 'than', 'just', 'thinking', 'about', 'it', '.']
Lower casing in sentence 5: ['i', 'recommend', 'you', 'that', 'you', 'explore', 'the', 'world', 'of', 'ai', 'and', 'data', 'science', '.']


### Deleting Stopwords

In [21]:
stop_words = set(stopwords.words('english'))
new_words = []
print("== Without stop words ==\n")
for sentence in corpus_lower_case:
    words=[]
    for word in sentence:
        if word not in stop_words:
            words.append(word)
    new_words.append(words)
    print(words)

== Without stop words ==

['hello', '!']
['name', 'safae', 'eraji', ',', 'ai', 'enthusiast', '.']
['student', 'ai', 'data', 'science', '.']
['data', 'science', 'ai', 'much', 'fun', 'application', 'thinking', '.']
['recommend', 'explore', 'world', 'ai', 'data', 'science', '.']


In [19]:
print(new_words)

[['hello', '!'], ['name', 'safae', 'eraji', ',', 'ai', 'enthusiast', '.'], ['student', 'ai', 'data', 'science', '.'], ['data', 'science', 'ai', 'much', 'fun', 'application', 'thinking', '.'], ['recommend', 'explore', 'world', 'ai', 'data', 'science', '.']]


### Lemmatization

In [29]:
nlp = spacy.load("en_core_web_sm")
corpus_lemms = []
print("== Lemmatization ==")
for token in new_words:
    tok = ' '.join(token)
    #print("tok : ", tok)
    doc = nlp(tok)
    #print("doc : ", doc)
    corpus_lemms.append([t.lemma_ for t in doc])
    print([t.lemma_ for t in doc])

== Lemmatization ==
['hello', '!']
['name', 'safae', 'eraji', ',', 'ai', 'enthusiast', '.']
['student', 'ai', 'data', 'science', '.']
['data', 'science', 'ai', 'much', 'fun', 'application', 'thinking', '.']
['recommend', 'explore', 'world', 'ai', 'datum', 'science', '.']


In [28]:
corpus_lemms

[['hello', '!'],
 ['name', 'safae', 'eraji', ',', 'ai', 'enthusiast', '.'],
 ['student', 'ai', 'data', 'science', '.'],
 ['data', 'science', 'ai', 'much', 'fun', 'application', 'thinking', '.'],
 ['recommend', 'explore', 'world', 'ai', 'datum', 'science', '.']]

### Vocabulary Construction

In [31]:
vocab = set()
for i in corpus_lemms:
    for j in i:
        vocab.add(j) 

print("Vocabulary:", vocab)
print("Vocabulary Size:", len(vocab))

Vocabulaire: {'explore', '!', 'much', 'thinking', 'enthusiast', 'science', 'safae', 'data', 'recommend', 'hello', '.', 'eraji', 'datum', 'fun', 'application', 'student', 'world', 'ai', ',', 'name'}
Taille du vocabulaire: 20


### Vocabulary Indexation

In [32]:
index_vocab = {}

for i, w in enumerate(vocab):
    index_vocab[w] = i

print("Indexed Vocabulary:", index_vocab)

Indexed Vocabulary: {'explore': 0, '!': 1, 'much': 2, 'thinking': 3, 'enthusiast': 4, 'science': 5, 'safae': 6, 'data': 7, 'recommend': 8, 'hello': 9, '.': 10, 'eraji': 11, 'datum': 12, 'fun': 13, 'application': 14, 'student': 15, 'world': 16, 'ai': 17, ',': 18, 'name': 19}


### One-Hot Vectorization 

In [33]:
def create_one_hot(word, vocab_dict=index_vocab, vocab_size=len(vocab)):
    # A one-hot vector for a certain word
    vector = [0] * vocab_size
    if word in vocab_dict:
        vector[vocab_dict[word]] = 1
    return vector

In [34]:
vectors_one_hot=[]
for i in corpus_lemms:
    vector_sentences = []
    for j in i:
        v = create_one_hot(j)
        vector_sentences.append(v)
    vectors_one_hot.append(vector_sentences)

# For the first sentence : 
print(vectors_one_hot[0])

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [43]:
for i, vector in enumerate(vectors_one_hot):
    print(f"\nVector of sentence {i+1} : ", vector)


Vector of sentence 1 :  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

Vector of sentence 2 :  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

Vector of sentence 3 :  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

Vector of sentence 4 :  [[0, 0, 0, 0, 0, 0, 0, 1, 0, 