In [350]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer

In [351]:
data = {
    "Text":["The cat sat on the good mat",
    "The dog lay on the good rug",
    "The cat chased the mouse",
    "Apple is a good, but I prefer Orange products.",
    "The dog barked at the cat",
    "cat was drinking the milk",
    "peter, was playing with dog",
    "Tesla is going to acquire twitter for $45 billion",
    "Michael Blooberg founded Blommerg L.P. in 1982"],
    "label":[0,1,0,1,1,0,0,1,1]
}

In [390]:
df=pd.DataFrame(data)
df

Unnamed: 0,Text,label
0,The cat sat on the good mat,0
1,The dog lay on the good rug,1
2,The cat chased the mouse,0
3,"Apple is a good, but I prefer Orange products.",1
4,The dog barked at the cat,1
5,cat was drinking the milk,0
6,"peter, was playing with dog",0
7,Tesla is going to acquire twitter for $45 billion,1
8,Michael Blooberg founded Blommerg L.P. in 1982,1


### Taking out the corpus

In [391]:
corpus = set(df["Text"])
corpus

{'Apple is a good, but I prefer Orange products.',
 'Michael Blooberg founded Blommerg L.P. in 1982',
 'Tesla is going to acquire twitter for $45 billion',
 'The cat chased the mouse',
 'The cat sat on the good mat',
 'The dog barked at the cat',
 'The dog lay on the good rug',
 'cat was drinking the milk',
 'peter, was playing with dog'}

### Tokenization

In [392]:
def tokenization(text):
    return text.split()

df["Token"]=df["Text"].apply(tokenization)

df

Unnamed: 0,Text,label,Token
0,The cat sat on the good mat,0,"[The, cat, sat, on, the, good, mat]"
1,The dog lay on the good rug,1,"[The, dog, lay, on, the, good, rug]"
2,The cat chased the mouse,0,"[The, cat, chased, the, mouse]"
3,"Apple is a good, but I prefer Orange products.",1,"[Apple, is, a, good,, but, I, prefer, Orange, ..."
4,The dog barked at the cat,1,"[The, dog, barked, at, the, cat]"
5,cat was drinking the milk,0,"[cat, was, drinking, the, milk]"
6,"peter, was playing with dog",0,"[peter,, was, playing, with, dog]"
7,Tesla is going to acquire twitter for $45 billion,1,"[Tesla, is, going, to, acquire, twitter, for, ..."
8,Michael Blooberg founded Blommerg L.P. in 1982,1,"[Michael, Blooberg, founded, Blommerg, L.P., i..."


### Lemmatization

In [393]:
def lemmatization(text):
    a=[]
    nlp=spacy.load("en_core_web_md")
    doc=nlp(text)
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        a.append(token.lemma_)
    return " ".join(a)


In [394]:
df["lema"]=df["Text"].apply(lemmatization)
df

Unnamed: 0,Text,label,Token,lema
0,The cat sat on the good mat,0,"[The, cat, sat, on, the, good, mat]",cat sit good mat
1,The dog lay on the good rug,1,"[The, dog, lay, on, the, good, rug]",dog lie good rug
2,The cat chased the mouse,0,"[The, cat, chased, the, mouse]",cat chase mouse
3,"Apple is a good, but I prefer Orange products.",1,"[Apple, is, a, good,, but, I, prefer, Orange, ...",Apple good prefer Orange product
4,The dog barked at the cat,1,"[The, dog, barked, at, the, cat]",dog bark cat
5,cat was drinking the milk,0,"[cat, was, drinking, the, milk]",cat drink milk
6,"peter, was playing with dog",0,"[peter,, was, playing, with, dog]",peter play dog
7,Tesla is going to acquire twitter for $45 billion,1,"[Tesla, is, going, to, acquire, twitter, for, ...",Tesla go acquire twitter $ 45 billion
8,Michael Blooberg founded Blommerg L.P. in 1982,1,"[Michael, Blooberg, founded, Blommerg, L.P., i...",Michael Blooberg found Blommerg L.P. 1982


In [395]:
lemmatized_corpus=set(df["lema"])
lemmatized_corpus

{'Apple good prefer Orange product',
 'Michael Blooberg found Blommerg L.P. 1982',
 'Tesla go acquire twitter $ 45 billion',
 'cat chase mouse',
 'cat drink milk',
 'cat sit good mat',
 'dog bark cat',
 'dog lie good rug',
 'peter play dog'}

In [396]:
df["lema_token"]=df["lema"].apply(tokenization)

df

Unnamed: 0,Text,label,Token,lema,lema_token
0,The cat sat on the good mat,0,"[The, cat, sat, on, the, good, mat]",cat sit good mat,"[cat, sit, good, mat]"
1,The dog lay on the good rug,1,"[The, dog, lay, on, the, good, rug]",dog lie good rug,"[dog, lie, good, rug]"
2,The cat chased the mouse,0,"[The, cat, chased, the, mouse]",cat chase mouse,"[cat, chase, mouse]"
3,"Apple is a good, but I prefer Orange products.",1,"[Apple, is, a, good,, but, I, prefer, Orange, ...",Apple good prefer Orange product,"[Apple, good, prefer, Orange, product]"
4,The dog barked at the cat,1,"[The, dog, barked, at, the, cat]",dog bark cat,"[dog, bark, cat]"
5,cat was drinking the milk,0,"[cat, was, drinking, the, milk]",cat drink milk,"[cat, drink, milk]"
6,"peter, was playing with dog",0,"[peter,, was, playing, with, dog]",peter play dog,"[peter, play, dog]"
7,Tesla is going to acquire twitter for $45 billion,1,"[Tesla, is, going, to, acquire, twitter, for, ...",Tesla go acquire twitter $ 45 billion,"[Tesla, go, acquire, twitter, $, 45, billion]"
8,Michael Blooberg founded Blommerg L.P. in 1982,1,"[Michael, Blooberg, founded, Blommerg, L.P., i...",Michael Blooberg found Blommerg L.P. 1982,"[Michael, Blooberg, found, Blommerg, L.P., 1982]"


### Vocabulary

In [397]:
from sklearn.feature_extraction.text import CountVectorizer


v=CountVectorizer(ngram_range=(1,1))
v.fit(df["lema"])
vocab=v.vocabulary_
vocab

{'cat': 8,
 'sit': 26,
 'good': 14,
 'mat': 16,
 'dog': 10,
 'lie': 15,
 'rug': 25,
 'chase': 9,
 'mouse': 19,
 'apple': 3,
 'prefer': 23,
 'orange': 20,
 'product': 24,
 'bark': 4,
 'drink': 11,
 'milk': 18,
 'peter': 21,
 'play': 22,
 'tesla': 27,
 'go': 13,
 'acquire': 2,
 'twitter': 28,
 '45': 1,
 'billion': 5,
 'michael': 17,
 'blooberg': 7,
 'found': 12,
 'blommerg': 6,
 '1982': 0}

# POS

In [398]:
nlp=spacy.load("en_core_web_sm")
for sentence in corpus:
    doc=nlp(sentence)
    for token in doc:
        print(token,"|",token.pos_,"| ", spacy.explain(token.pos_),"| ", token.tag_,"| ", spacy.explain(token.tag_))

cat | NOUN |  noun |  NN |  noun, singular or mass
was | AUX |  auxiliary |  VBD |  verb, past tense
drinking | VERB |  verb |  VBG |  verb, gerund or present participle
the | DET |  determiner |  DT |  determiner
milk | NOUN |  noun |  NN |  noun, singular or mass
Tesla | PROPN |  proper noun |  NNP |  noun, proper singular
is | AUX |  auxiliary |  VBZ |  verb, 3rd person singular present
going | VERB |  verb |  VBG |  verb, gerund or present participle
to | PART |  particle |  TO |  infinitival "to"
acquire | VERB |  verb |  VB |  verb, base form
twitter | NOUN |  noun |  NN |  noun, singular or mass
for | ADP |  adposition |  IN |  conjunction, subordinating or preposition
$ | SYM |  symbol |  $ |  symbol, currency
45 | NUM |  numeral |  CD |  cardinal number
billion | NUM |  numeral |  CD |  cardinal number
The | DET |  determiner |  DT |  determiner
dog | NOUN |  noun |  NN |  noun, singular or mass
lay | VERB |  verb |  VBD |  verb, past tense
on | ADP |  adposition |  IN |  conj

### This is problem of pos it was not able to tell that michael blooberg was a person and blommerg was a company

In [399]:
doc=nlp("Michael Blooberg founded Blommerg L.P. in 1982")
for token in doc:
            print(token,"|",token.pos_,"| ", spacy.explain(token.pos_),"| ", token.tag_,"| ", spacy.explain(token.tag_))

Michael | PROPN |  proper noun |  NNP |  noun, proper singular
Blooberg | PROPN |  proper noun |  NNP |  noun, proper singular
founded | VERB |  verb |  VBD |  verb, past tense
Blommerg | PROPN |  proper noun |  NNP |  noun, proper singular
L.P. | PROPN |  proper noun |  NNP |  noun, proper singular
in | ADP |  adposition |  IN |  conjunction, subordinating or preposition
1982 | NUM |  numeral |  CD |  cardinal number


# NER


In [400]:
nlp=spacy.load("en_core_web_sm")
for sentence in corpus:
    doc=nlp(sentence)
    for ent in doc.ents:
        print(ent.text,"|",ent.label_)

Tesla | ORG
$45 billion | MONEY
peter | PERSON
Michael Blooberg | PERSON
Blommerg L.P. | ORG
1982 | DATE
Apple | ORG
Orange | NORP


### This is problem of ner it was not able to tell that apple and orange are fruits not company or nationality as it does not consider context

In [363]:
doc=nlp("Apple is a good, but I prefer Orange products.")
for ent in doc.ents:
        print(ent.text,"|",ent.label_,"|",spacy.explain(ent.label_))

Apple | ORG | Companies, agencies, institutions, etc.
Orange | NORP | Nationalities or religious or political groups


# BAG OF WORDS

In [364]:
v=CountVectorizer(ngram_range=(1,1))
v.fit(lemmatized_corpus)
v.vocabulary_

{'cat': 8,
 'chase': 9,
 'mouse': 19,
 'tesla': 27,
 'go': 13,
 'acquire': 2,
 'twitter': 28,
 '45': 1,
 'billion': 5,
 'michael': 17,
 'blooberg': 7,
 'found': 12,
 'blommerg': 6,
 '1982': 0,
 'dog': 10,
 'bark': 4,
 'apple': 3,
 'good': 14,
 'prefer': 23,
 'orange': 20,
 'product': 24,
 'peter': 21,
 'play': 22,
 'lie': 15,
 'rug': 25,
 'drink': 11,
 'milk': 18,
 'sit': 26,
 'mat': 16}

In [365]:
v.transform(lemmatized_corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1],
       [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0]])

# N BAG OF WORDS

In [366]:
v=CountVectorizer(ngram_range=(1,3))
v.fit(lemmatized_corpus)
v.vocabulary_

{'cat': 17,
 'chase': 24,
 'mouse': 52,
 'cat chase': 18,
 'chase mouse': 25,
 'cat chase mouse': 19,
 'tesla': 68,
 'go': 36,
 'acquire': 3,
 'twitter': 71,
 '45': 1,
 'billion': 11,
 'tesla go': 69,
 'go acquire': 37,
 'acquire twitter': 4,
 'twitter 45': 72,
 '45 billion': 2,
 'tesla go acquire': 70,
 'go acquire twitter': 38,
 'acquire twitter 45': 5,
 'twitter 45 billion': 73,
 'michael': 48,
 'blooberg': 14,
 'found': 33,
 'blommerg': 12,
 '1982': 0,
 'michael blooberg': 49,
 'blooberg found': 15,
 'found blommerg': 34,
 'blommerg 1982': 13,
 'michael blooberg found': 50,
 'blooberg found blommerg': 16,
 'found blommerg 1982': 35,
 'dog': 26,
 'bark': 9,
 'dog bark': 27,
 'bark cat': 10,
 'dog bark cat': 28,
 'apple': 6,
 'good': 39,
 'prefer': 60,
 'orange': 53,
 'product': 63,
 'apple good': 7,
 'good prefer': 41,
 'prefer orange': 61,
 'orange product': 54,
 'apple good prefer': 8,
 'good prefer orange': 42,
 'prefer orange product': 62,
 'peter': 55,
 'play': 58,
 'peter play

In [367]:
v.transform(lemmatized_corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0,

# TF-IDF


In [368]:
from sklearn.feature_extraction.text import TfidfVectorizer
v=TfidfVectorizer()
transformed_output=v.fit_transform(lemmatized_corpus)
print(v.vocabulary_)

{'cat': 8, 'chase': 9, 'mouse': 19, 'tesla': 27, 'go': 13, 'acquire': 2, 'twitter': 28, '45': 1, 'billion': 5, 'michael': 17, 'blooberg': 7, 'found': 12, 'blommerg': 6, '1982': 0, 'dog': 10, 'bark': 4, 'apple': 3, 'good': 14, 'prefer': 23, 'orange': 20, 'product': 24, 'peter': 21, 'play': 22, 'lie': 15, 'rug': 25, 'drink': 11, 'milk': 18, 'sit': 26, 'mat': 16}


In [369]:
transformed_output.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.41701261, 0.64268985,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.64268985,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.40824829, 0.40824829, 0.        , 0.        ,
        0.40824829, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.40824829, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.40824829, 0.40824829],
       [0.4472136 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.4472136 , 0.4472136 , 0.        , 0.        ,
        0.        , 0.        , 0.4472136 , 0.        , 0.        ,
  

In [370]:
all_feature=v.get_feature_names_out()  # gives vocabulary in oder
all_feature

array(['1982', '45', 'acquire', 'apple', 'bark', 'billion', 'blommerg',
       'blooberg', 'cat', 'chase', 'dog', 'drink', 'found', 'go', 'good',
       'lie', 'mat', 'michael', 'milk', 'mouse', 'orange', 'peter',
       'play', 'prefer', 'product', 'rug', 'sit', 'tesla', 'twitter'],
      dtype=object)

In [371]:
for word in all_feature:
    indx=v.vocabulary_.get(word)
    score=v.idf_[indx] #IDF tells us the weight or importance of a word across all documents.
    print(word,score)

1982 2.6094379124341005
45 2.6094379124341005
acquire 2.6094379124341005
apple 2.6094379124341005
bark 2.6094379124341005
billion 2.6094379124341005
blommerg 2.6094379124341005
blooberg 2.6094379124341005
cat 1.6931471805599454
chase 2.6094379124341005
dog 1.916290731874155
drink 2.6094379124341005
found 2.6094379124341005
go 2.6094379124341005
good 1.916290731874155
lie 2.6094379124341005
mat 2.6094379124341005
michael 2.6094379124341005
milk 2.6094379124341005
mouse 2.6094379124341005
orange 2.6094379124341005
peter 2.6094379124341005
play 2.6094379124341005
prefer 2.6094379124341005
product 2.6094379124341005
rug 2.6094379124341005
sit 2.6094379124341005
tesla 2.6094379124341005
twitter 2.6094379124341005


In [372]:
transformed_output.toarray() # constructing tfid for each sentence. using index position
                             # score is higher for rare element and lower for frequent occuring
    #TF-IDF combines IDF with the term’s frequency in a specific document to calculate its relevance in that document.

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.41701261, 0.64268985,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.64268985,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.40824829, 0.40824829, 0.        , 0.        ,
        0.40824829, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.40824829, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.40824829, 0.40824829],
       [0.4472136 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.4472136 , 0.4472136 , 0.        , 0.        ,
        0.        , 0.        , 0.4472136 , 0.        , 0.        ,
  

In [419]:
def compute_vector(text):
    return nlp(text).vector # Returns the dense vector representation (embedding) of the entire input text. This vector is typically a fixed-dimensional representation of the text computed by averaging the word embeddings of individual tokens (depending on the model).

# Apply the function to the 'text' column
df['tfidf_vector'] = df['lema'].apply(compute_vector)
df 

Unnamed: 0,Text,label,Token,lema,lema_token,sentence_embedding,tfidf_vector
0,The cat sat on the good mat,0,"[The, cat, sat, on, the, good, mat]",cat sit good mat,"[cat, sit, good, mat]","[-4.296632867543307, -3.4300067188059913, -1.9...","[0.44658685, -0.69135404, -0.0017491952, -0.51..."
1,The dog lay on the good rug,1,"[The, dog, lay, on, the, good, rug]",dog lie good rug,"[dog, lie, good, rug]","[-4.738853494951768, -3.2760543845549246, -3.1...","[0.5090586, -0.9326261, 0.4606844, -0.08693685..."
2,The cat chased the mouse,0,"[The, cat, chased, the, mouse]",cat chase mouse,"[cat, chase, mouse]","[1.1724748867906267, 0.550760702030434, 1.0257...","[-0.35758713, -0.9874146, 0.6990876, 0.2098652..."
3,"Apple is a good, but I prefer Orange products.",1,"[Apple, is, a, good,, but, I, prefer, Orange, ...",Apple good prefer Orange product,"[Apple, good, prefer, Orange, product]","[-14.809958700664838, -11.215513791799088, -11...","[-0.0028217435, -0.9776285, -0.17968778, 0.142..."
4,The dog barked at the cat,1,"[The, dog, barked, at, the, cat]",dog bark cat,"[dog, bark, cat]","[0.27359209231933174, 0.2635787607001398, 0.50...","[-0.08661825, -1.0619875, 0.44906196, 0.564633..."
5,cat was drinking the milk,0,"[cat, was, drinking, the, milk]",cat drink milk,"[cat, drink, milk]","[0.6922010044068547, 0.36107065406917616, 1.81...","[-0.5067006, -0.69687396, 0.07482315, -0.10917..."
6,"peter, was playing with dog",0,"[peter,, was, playing, with, dog]",peter play dog,"[peter, play, dog]","[-0.16176466128310363, 0.37738073174769066, -0...","[-0.18020652, -0.7468036, 0.08205422, 0.309880..."
7,Tesla is going to acquire twitter for $45 billion,1,"[Tesla, is, going, to, acquire, twitter, for, ...",Tesla go acquire twitter $ 45 billion,"[Tesla, go, acquire, twitter, $, 45, billion]","[-9.027906575971008, -6.167464441584448, -6.99...","[-0.5984076, -0.35710755, 0.51680726, 0.033701..."
8,Michael Blooberg founded Blommerg L.P. in 1982,1,"[Michael, Blooberg, founded, Blommerg, L.P., i...",Michael Blooberg found Blommerg L.P. 1982,"[Michael, Blooberg, found, Blommerg, L.P., 1982]","[-23.37997013001771, -16.96360514470485, -17.4...","[0.08060862, -0.62851846, 0.42036834, 0.212501..."


In [374]:
from sklearn.model_selection import train_test_split
x = np.array(df['tfidf_vector'].to_list())  # Convert to 2D numpy array
y = df['label'].values 
x_train,x_test,y_train,y_test=train_test_split(
    x,y,
    test_size=0.3,
    random_state=2020,
    
)
from sklearn.neighbors import KNeighborsClassifier
clf=KNeighborsClassifier()
clf.fit(x_train,y_train)

KNeighborsClassifier()

In [375]:
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       0.00      0.00      0.00         2

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### This fails in case of similar words it creates different vector score for two similar meaning word


#### good and great have same meaning but still it is not able to fill great instead of good 

In [376]:
new_sentence = "cat sat on the great mat"  # was not able to gain context 

# Transform the new sentence using the fitted vectorizer
new_sentence_vector = v.transform([new_sentence])

# Display the resulting vector
print("New Sentence Vector (TF-IDF):\n", new_sentence_vector.toarray())


New Sentence Vector (TF-IDF):
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.54431302 0.         0.         0.
  0.         0.         0.         0.         0.8388822  0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]


In [377]:
new_sentence = "cat sat on the good mat"

# Transform the new sentence using the fitted vectorizer
new_sentence_vector = v.transform([new_sentence])

# Display the resulting vector
print("New Sentence Vector (TF-IDF):\n", new_sentence_vector.toarray())

New Sentence Vector (TF-IDF):
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.46343118 0.         0.         0.
  0.         0.         0.52450778 0.         0.71422904 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]


# EMBEDDING CBOW

In [405]:
# Step 2: Generate Training Data for CBOW
def generate_training_data(sentences, window_size=2):
    data = []
    for sentence in sentences:
        for idx, word in enumerate(sentence):
            # Define context window
            context = []
            for offset in range(-window_size, window_size + 1):
                if offset == 0 or idx + offset < 0 or idx + offset >= len(sentence):
                    continue
                context.append(sentence[idx + offset])  # Collect words in context
            data.append((context, word))  # Add context and target word as tuple
    return data

# Example sentences (your tokenized or lemmatized sentences)
sentences =df["lema_token"]

# Define window size (context size)
window_size = 2

# Generate training data using the function
training_data = generate_training_data(sentences, window_size)

# Print the training data (context, target word pairs)
for context, target in training_data:
    print(f"Context: {context} -> Target: {target}")


Context: ['sit', 'good'] -> Target: cat
Context: ['cat', 'good', 'mat'] -> Target: sit
Context: ['cat', 'sit', 'mat'] -> Target: good
Context: ['sit', 'good'] -> Target: mat
Context: ['lie', 'good'] -> Target: dog
Context: ['dog', 'good', 'rug'] -> Target: lie
Context: ['dog', 'lie', 'rug'] -> Target: good
Context: ['lie', 'good'] -> Target: rug
Context: ['chase', 'mouse'] -> Target: cat
Context: ['cat', 'mouse'] -> Target: chase
Context: ['cat', 'chase'] -> Target: mouse
Context: ['good', 'prefer'] -> Target: Apple
Context: ['Apple', 'prefer', 'Orange'] -> Target: good
Context: ['Apple', 'good', 'Orange', 'product'] -> Target: prefer
Context: ['good', 'prefer', 'product'] -> Target: Orange
Context: ['prefer', 'Orange'] -> Target: product
Context: ['bark', 'cat'] -> Target: dog
Context: ['dog', 'cat'] -> Target: bark
Context: ['dog', 'bark'] -> Target: cat
Context: ['drink', 'milk'] -> Target: cat
Context: ['cat', 'milk'] -> Target: drink
Context: ['cat', 'drink'] -> Target: milk
Conte

In [406]:
vocab_size=len(vocab)
def one_hot_encode(word, vocab_size, vocab):

    vec = np.zeros(vocab_size)
    index = vocab.get(word)  # Get index of the word
    if index is not None:
        vec[index] = 1
    return vec

# Step 4: Prepare training data (X_train, y_train)
X_train = []
y_train = []

for context, target in training_data:
    context_vectors = np.mean(
        [one_hot_encode(word, vocab_size, vocab) for word in context],
        axis=0
    )
    X_train.append(context_vectors)  # Average context vectors
    y_train.append(one_hot_encode(target, vocab_size, vocab))  # Target word

X_train = np.array(X_train)
y_train = np.array(y_train)

# Output the one-hot encoded vectors for context and target
print("Vocabulary:", vocab)
print("Example X_train (context vectors):", X_train[:3])  # Show first 3 context vectors
print("Example y_train (target vectors):", y_train[:3]) 


Vocabulary: {'cat': 8, 'sit': 26, 'good': 14, 'mat': 16, 'dog': 10, 'lie': 15, 'rug': 25, 'chase': 9, 'mouse': 19, 'apple': 3, 'prefer': 23, 'orange': 20, 'product': 24, 'bark': 4, 'drink': 11, 'milk': 18, 'peter': 21, 'play': 22, 'tesla': 27, 'go': 13, 'acquire': 2, 'twitter': 28, '45': 1, 'billion': 5, 'michael': 17, 'blooberg': 7, 'found': 12, 'blommerg': 6, '1982': 0}
Example X_train (context vectors): [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5        0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5        0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.33333333 0.         0.         0.
  0.         0.         0.33333333 0.         0.33333333 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.    

In [407]:
# Step 4: Train CBOW Model
embedding_dim = 10
learning_rate = 0.01
epochs = 1000

# Initialize weights
W1 = np.random.rand(vocab_size, embedding_dim)
W2 = np.random.rand(embedding_dim, vocab_size)

# Softmax function
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Stability trick
    return exp_x / exp_x.sum(axis=0)

# Training loop
for epoch in range(epochs):
    loss = 0
    for x, y in zip(X_train, y_train):
        # Forward pass
        hidden = np.dot(x, W1)  # Input -> Hidden
        output = softmax(np.dot(hidden, W2))  # Hidden -> Output

        # Compute loss (cross-entropy)
        loss += -np.sum(y * np.log(output))

        # Backward pass
        error = output - y
        dW2 = np.outer(hidden, error)
        dW1 = np.outer(x, np.dot(W2, error))

        # Update weights
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")

# Save word embeddings
word_embeddings = {word: W1[vocab[word]] for word in vocab}

Epoch 100, Loss: 72.9180
Epoch 200, Loss: 50.4246
Epoch 300, Loss: 33.4638
Epoch 400, Loss: 24.2882
Epoch 500, Loss: 20.4325
Epoch 600, Loss: 18.3901
Epoch 700, Loss: 17.0111
Epoch 800, Loss: 16.5049
Epoch 900, Loss: 18.0451
Epoch 1000, Loss: 26.0069


In [408]:
word_embeddings

{'cat': array([-1.02517284,  1.19177745,  3.15856402, -0.98837161,  1.23414171,
         1.30423002, -1.40948848, -0.53241064,  1.74360404,  0.16879537]),
 'sit': array([ 1.5174755 , -0.08217617,  0.68999159,  0.34118686,  1.1537998 ,
         2.28059336,  1.68326367,  1.02355811,  2.23224919, -0.97962486]),
 'good': array([-17.85715484, -15.96592237, -14.22101138,  -9.94521133,
        -18.80080221, -10.44396608,  -5.46507517, -11.07840271,
         -8.51001618,  -7.81261581]),
 'mat': array([ 0.17832071,  1.13629421,  2.47881224,  3.0303404 ,  0.08573298,
        -0.0987675 ,  1.383535  , -1.05307498,  0.73770686, -0.4878286 ]),
 'dog': array([ 0.0537585 ,  0.50809972, -0.87019427, -0.77512937,  1.63865629,
        -0.63673703,  1.62594566,  3.56338458,  0.8231708 , -1.9612921 ]),
 'lie': array([-0.36151056,  0.95108762,  0.76684266,  3.46855487,  0.0130086 ,
         1.86410364,  0.76233592,  0.93903157,  1.44259809,  2.2746167 ]),
 'rug': array([-0.79050708,  1.40251749,  1.5980149

In [409]:
def sentence_embedding(sentence, word_embeddings, vocab):
    words = sentence.split()
    embeddings = [word_embeddings[word] for word in words if word in vocab]
    if embeddings:
        return np.mean(embeddings, axis=0)  # Mean pooling
    else:
        return np.zeros(W1.shape[1])  # Fallback for unknown words

# Example usage to generate sentence embeddings
sentence_embeddings = []

for sentence in df["lema"]:
    embedding = sentence_embedding(sentence, word_embeddings, vocab)
    sentence_embeddings.append(embedding)                               
    print(f"Sentence: {sentence}")
    print(f"Sentence Embedding: {embedding}")
df["sentence_embedding"] = sentence_embeddings

Sentence: cat sit good mat
Sentence Embedding: [-4.29663287 -3.43000672 -1.97341088 -1.89051392 -4.08178193 -1.73947755
 -0.95194124 -2.91008255 -0.94911402 -2.27781847]
Sentence: dog lie good rug
Sentence Embedding: [-4.73885349 -3.27605438 -3.18158701 -1.1902737  -4.30305822 -2.18818398
 -0.38657098 -1.34524605 -1.35856503 -2.00806205]
Sentence: cat chase mouse
Sentence Embedding: [ 1.17247489  0.5507607   1.02574567 -1.0717201   0.73098166  1.58927557
 -0.2144447   0.58673265  0.40397146  0.70389924]
Sentence: Apple good prefer Orange product
Sentence Embedding: [-14.8099587  -11.21551379 -11.34549741  -7.14710188 -14.53482487
  -9.09353281  -6.16204652  -8.31959611  -5.38919371  -7.41806264]
Sentence: dog bark cat
Sentence Embedding: [ 0.27359209  0.26357876  0.50015795 -0.28405649  1.46156329  0.46650755
  0.80071114  1.03467287  1.2114702   0.55951912]
Sentence: cat drink milk
Sentence Embedding: [ 0.692201    0.36107065  1.81441387 -1.14047278  1.24186498  0.05939822
  0.2114653

In [421]:
print(df.columns)



Index(['Text', 'label', 'Token', 'lema', 'lema_token', 'sentence_embedding',
       'tfidf_vector'],
      dtype='object')


In [422]:
df

Unnamed: 0,Text,label,Token,lema,lema_token,sentence_embedding,tfidf_vector
0,The cat sat on the good mat,0,"[The, cat, sat, on, the, good, mat]",cat sit good mat,"[cat, sit, good, mat]","[-4.296632867543307, -3.4300067188059913, -1.9...","[0.44658685, -0.69135404, -0.0017491952, -0.51..."
1,The dog lay on the good rug,1,"[The, dog, lay, on, the, good, rug]",dog lie good rug,"[dog, lie, good, rug]","[-4.738853494951768, -3.2760543845549246, -3.1...","[0.5090586, -0.9326261, 0.4606844, -0.08693685..."
2,The cat chased the mouse,0,"[The, cat, chased, the, mouse]",cat chase mouse,"[cat, chase, mouse]","[1.1724748867906267, 0.550760702030434, 1.0257...","[-0.35758713, -0.9874146, 0.6990876, 0.2098652..."
3,"Apple is a good, but I prefer Orange products.",1,"[Apple, is, a, good,, but, I, prefer, Orange, ...",Apple good prefer Orange product,"[Apple, good, prefer, Orange, product]","[-14.809958700664838, -11.215513791799088, -11...","[-0.0028217435, -0.9776285, -0.17968778, 0.142..."
4,The dog barked at the cat,1,"[The, dog, barked, at, the, cat]",dog bark cat,"[dog, bark, cat]","[0.27359209231933174, 0.2635787607001398, 0.50...","[-0.08661825, -1.0619875, 0.44906196, 0.564633..."
5,cat was drinking the milk,0,"[cat, was, drinking, the, milk]",cat drink milk,"[cat, drink, milk]","[0.6922010044068547, 0.36107065406917616, 1.81...","[-0.5067006, -0.69687396, 0.07482315, -0.10917..."
6,"peter, was playing with dog",0,"[peter,, was, playing, with, dog]",peter play dog,"[peter, play, dog]","[-0.16176466128310363, 0.37738073174769066, -0...","[-0.18020652, -0.7468036, 0.08205422, 0.309880..."
7,Tesla is going to acquire twitter for $45 billion,1,"[Tesla, is, going, to, acquire, twitter, for, ...",Tesla go acquire twitter $ 45 billion,"[Tesla, go, acquire, twitter, $, 45, billion]","[-9.027906575971008, -6.167464441584448, -6.99...","[-0.5984076, -0.35710755, 0.51680726, 0.033701..."
8,Michael Blooberg founded Blommerg L.P. in 1982,1,"[Michael, Blooberg, founded, Blommerg, L.P., i...",Michael Blooberg found Blommerg L.P. 1982,"[Michael, Blooberg, found, Blommerg, L.P., 1982]","[-23.37997013001771, -16.96360514470485, -17.4...","[0.08060862, -0.62851846, 0.42036834, 0.212501..."


In [423]:
def get_vector_for_word(word, vocab, word_embeddings, nlp_model):
    if word in vocab:
        return word_embeddings[word]
    else:
        # Use spaCy to find the closest word in vocabulary
        word_doc = nlp_model(word)
        best_match = None
        highest_similarity = -1  # Initialize with a very low similarity
        
        for w in vocab:
            similarity = nlp_model(w).similarity(word_doc)
            print(w,"  ",word_doc,similarity)
            if similarity > highest_similarity:
                highest_similarity = similarity
                best_match = w
        
        print(best_match)
        return word_embeddings[best_match]


In [424]:

# Example usage
new_sentence = "cat sit good mat"
tokens = new_sentence.split()

sentence_vector = np.mean(
    [get_vector_for_word(word, vocab, word_embeddings, nlp) for word in tokens],
    axis=0
)

print("Sentence Vector with OOV Handling:\n", sentence_vector)


Sentence Vector with OOV Handling:
 [-4.29663287 -3.43000672 -1.97341088 -1.89051392 -4.08178193 -1.73947755
 -0.95194124 -2.91008255 -0.94911402 -2.27781847]


In [413]:
new_sentence = "cat sit great mat"
tokens = new_sentence.split()

sentence_vector = np.mean(
    [get_vector_for_word(word, vocab, word_embeddings, nlp) for word in tokens],
    axis=0
)
print("Sentence Vector with OOV Handling:\n", sentence_vector)


cat    great 0.17494932238952837
sit    great 0.24969802926952261
good    great 0.7787824384089056
mat    great 0.38119960248173285
dog    great 0.34885706900178715
lie    great 0.24701933271216805
rug    great 0.2959269896398435
chase    great 0.3264818430676584
mouse    great 0.2715305626544373
apple    great 0.27350819127981346
prefer    great 0.017942998936631305
orange    great 0.4819340458497497
product    great 0.30430471958041877
bark    great 0.37586858167245396
drink    great 0.24296258217542302
milk    great 0.23878569036753866
peter    great 0.29925606235024854
play    great 0.10422958912082037
tesla    great 0.3779552529622165
go    great 0.2376881286750955
acquire    great 0.10534960470122061
twitter    great 0.0950113498008723
45    great 0.22742329913757364
billion    great 0.2691890682415729
michael    great 0.2750678895699965
blooberg    great 0.23578071152692218
found    great 0.27760391549651653
blommerg    great 0.28841831805076884
1982    great 0.08671371948398507

  similarity = nlp_model(w).similarity(word_doc)


# Machine Learning

In [415]:
x = np.array([embedding for embedding in df["sentence_embedding"]])

In [416]:
y = df["label"].values 

In [417]:
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.20, random_state=2020)

# Train Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate model
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
