# N-Grams

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class NGramLanguageModel:
    def __init__(self, n):
        self.n = n
        self.vectorizer = CountVectorizer(analyzer='word', ngram_range=(n,n))
        self.corpus = []
        self.load_corpus()

    def fit_transform(self):
        return self.vectorizer.fit_transform(self.corpus)
    
    def transform(self):
        return self.vectorizer.transform(self.corpus)

    def load_corpus(self):
        with open('corpus.txt', 'r') as file:
           self.corpus = file.readlines()

    def get_corpus(self):
        return self.corpus

def calcuate_cosine_similarity(matrix, query_v):
    similarities = cosine_similarity(query_v, matrix)
    return similarities

In [17]:
n = 1
ngram_model = NGramLanguageModel(n)


matrix = ngram_model.fit_transform()
query_v = ngram_model.transform()

print(f'{n}-Gram Model: ')
data = matrix.A

print(pd.DataFrame(matrix.A, columns=ngram_model.vectorizer.get_feature_names_out()))
print(query_v.A)

1-Gram Model: 
   000  11  13  1879  1889  1969  19th  ada  against  agra  ...  wall  was  \
0    0   0   0     0     1     0     0    0        0     0  ...     0    1   
1    0   1   0     0     0     1     0    0        0     0  ...     0    0   
2    0   0   0     1     0     0     0    0        0     0  ...     0    1   
3    0   0   0     0     0     0     0    0        0     0  ...     0    0   
4    1   0   1     0     0     0     0    0        1     0  ...     1    1   
5    0   0   0     0     0     0     0    0        0     0  ...     0    0   
6    0   0   0     0     0     0     0    0        0     1  ...     0    0   
7    0   0   0     0     0     0     1    1        0     0  ...     0    0   
8    0   0   0     0     0     0     0    0        0     0  ...     0    0   
9    0   0   0     0     0     0     0    0        0     0  ...     0    0   

   widely  wildebeest  william  with  worked  works  world  writers  
0       0           0        0     0       0      0     

In [22]:
similarities = calcuate_cosine_similarity(matrix, query_v)

data = {'Document': ngram_model.get_corpus(), 'Similarity': similarities[0]}
df = pd.DataFrame(data)

df

Unnamed: 0,Document,Similarity
0,"The Eiffel Tower is located in Paris, France. ...",1.0
1,"In 1969, Neil Armstrong and Buzz Aldrin became...",0.247436
2,"Albert Einstein, a theoretical physicist, deve...",0.308607
3,The Amazon Rainforest is home to a diverse ran...,0.346467
4,The Great Wall of China is a world-renowned fo...,0.279145
5,"William Shakespeare, an English playwright, is...",0.299392
6,"The Taj Mahal, located in Agra, India, is a UN...",0.370479
7,Ada Lovelace is considered the world's first c...,0.279145
8,The Serengeti National Park in Tanzania is fam...,0.370479
9,"Michael Jordan, a basketball legend, played th...",0.168359


# Word Embedding

In [28]:
from nltk import word_tokenize
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

corpus = []
with open('corpus.txt', 'r') as file:
    corpus = file.readlines()

query = 'I was born in 2005 in California, USA, I go to binus university and I am a part-time lab teaching assistant'

tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]
tokenized_query = word_tokenize(query.lower())

model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

similarities = []

for token in tokenized_corpus:
    similarity = model.wv.n_similarity(tokenized_query, token)
    similarities.append(similarity)

df = pd.DataFrame({'Document': corpus, 'Similarity': similarities})
df

Unnamed: 0,Document,Similarity
0,"The Eiffel Tower is located in Paris, France. ...",0.473185
1,"In 1969, Neil Armstrong and Buzz Aldrin became...",0.406962
2,"Albert Einstein, a theoretical physicist, deve...",0.595622
3,The Amazon Rainforest is home to a diverse ran...,0.299586
4,The Great Wall of China is a world-renowned fo...,0.230761
5,"William Shakespeare, an English playwright, is...",0.377185
6,"The Taj Mahal, located in Agra, India, is a UN...",0.620114
7,Ada Lovelace is considered the world's first c...,0.118555
8,The Serengeti National Park in Tanzania is fam...,0.123411
9,"Michael Jordan, a basketball legend, played th...",0.264627


# Grammar Parsing

In [47]:
from nltk import CFG
from nltk.parse import ChartParser

# CFG = set of recursive rules used to generate patterns of strings
def demonstrate_nlp_parsing(sentence, grammar):
    words = nltk.word_tokenize(sentence)
    parser = ChartParser(grammar)

    parses = list(parser.parse(words))
    if parses:
        for tree in parser.parse(sentence.split()):
            print(tree, '\n')
            tree.pretty_print()
    else:
        print("No Parses Found")

nlp_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det N PP | 'i'
    VP -> V NP | V NP PP | V
    Det -> 'the' | 'a' | 'my'
    N -> 'person' | 'wallet' | 'building' | 'gun' | 'fire'
    V -> 'saw' | 'ate' | 'chased' | 'ran' | 'lost' | 'went' | 'have' | 'VBP' | 'caught' | 'is'
    PP -> P NP
    P -> 'in' | 'on' | 'with' | 'from'
    Adj -> 'big' | 'away' | 'huge'
""")

nlp_sentences = [
    'i have a gun',
    'the building is huge',
    'i lost my wallet',
    'the person ran away with my wallet'
]

for i, sentence in enumerate(nlp_sentences):
    print(f'Sentence {i+1}: {sentence}')
    demonstrate_nlp_parsing(sentence, nlp_grammar)
    print()

Sentence 1: i have a gun
(S (NP i) (VP (V have) (NP (Det a) (N gun)))) 

          S             
  ________|___           
 |            VP        
 |    ________|___       
 |   |            NP    
 |   |         ___|___   
 NP  V       Det      N 
 |   |        |       |  
 i  have      a      gun


Sentence 2: the building is huge
No Parses Found

Sentence 3: i lost my wallet
(S (NP i) (VP (V lost) (NP (Det my) (N wallet)))) 

          S                
  ________|___              
 |            VP           
 |    ________|___          
 |   |            NP       
 |   |         ___|____     
 NP  V       Det       N   
 |   |        |        |    
 i  lost      my     wallet


Sentence 4: the person ran away with my wallet
No Parses Found



# Dependency Parsing with SpaCy

In [49]:
import spacy

def extract_named_entities_and_parse_tree(sentence):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)

    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f'{token.text} --{token.dep_}--> {token.head.text} ({token.pos_})')

extract_named_entities_and_parse_tree('I was born in 2005 in California, USA, I go to binus university and I am a part-time lab teaching assistant')

Formatted Dependency Parse Tree:
I --nsubjpass--> born (PRON)
was --auxpass--> born (AUX)
born --ccomp--> go (VERB)
in --prep--> born (ADP)
2005 --pobj--> in (NUM)
in --prep--> born (ADP)
California --pobj--> in (PROPN)
, --punct--> California (PUNCT)
USA --appos--> California (PROPN)
, --punct--> go (PUNCT)
I --nsubj--> go (PRON)
go --ROOT--> go (VERB)
to --aux--> binus (PART)
binus --advcl--> go (VERB)
university --dobj--> binus (PROPN)
and --cc--> go (CCONJ)
I --nsubj--> am (PRON)
am --conj--> go (AUX)
a --det--> lab (DET)
part --amod--> time (ADJ)
- --punct--> time (PUNCT)
time --compound--> lab (NOUN)
lab --attr--> am (NOUN)
teaching --acl--> lab (VERB)
assistant --dobj--> teaching (NOUN)


# Named Entity Recognition

In [59]:
import spacy

def extract_named_entities(sentence):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sentence)

    categories = {}
    for ent in doc.ents:
        label = ent.label_
        if label not in categories:
            categories[label] = []
        if ent.text not in categories[label]:
            categories[label].append(ent.text)

    for label, entities in categories.items():
        print(f"{label}: {entities}")

corpus = []
with open('corpus.txt', 'r') as file:
    corpus = file.readlines()

sentence = ' '.join(corpus)
sentence = ''.join([s.replace('\n', '') for s in sentence])

extract_named_entities(sentence)

FAC: ['The Eiffel Tower', 'the moon during', 'The Great Wall of China']
GPE: ['Paris', 'France', 'Germany', 'Juliet', 'Agra', 'India', 'Tanzania']
DATE: ['1889', '1969', '1879', 'the 19th century', 'annual']
PERSON: ['Neil Armstrong', 'Buzz Aldrin', 'Albert Einstein', 'William Shakespeare', 'Romeo', 'Ada Lovelace', 'Analytical Engine', 'Michael Jordan']
ORDINAL: ['first']
LAW: ['Apollo 11']
ORG: ['Amazon', 'UNESCO World Heritage Site', "Charles Babbage's", 'The Serengeti National Park', 'the Chicago Bulls']
LOC: ['Earth']
QUANTITY: ['over 13,000 miles']
LANGUAGE: ['English']
WORK_OF_ART: ['Hamlet', 'The Taj Mahal']
