#### About

> Dependency Parsing

Dependency parsing is a natural language processing (NLP) technique that involves analyzing the grammatical structure of a sentence by identifying the relationships or dependencies between the words in the sentence. It represents the syntactic structure of a sentence as a directed acyclic graph (DAG), where the words are the nodes and the dependencies between the words are the edges.



Example - 

For example, consider the sentence: "The cat chased the mouse." The dependency parse tree for this sentence would have "cat" and "mouse" as dependent nodes, and "chased" as the governing node. The edge between "cat" and "chased" would be labeled as "subject," indicating that "cat" is the subject of the verb "chased." Similarly, the edge between "mouse" and "chased" would be labeled as "object," indicating that "mouse" is the object of the verb "chased."



Dataset - UniversalDependencies(https://universaldependencies.org/)

Using spacy for Dependency parsing

In [6]:
import spacy
#load the spacy model
nlp = spacy.load("en_core_web_sm")
#input sentence
sentence = "The cat chased the mouse."

# preprocess the sentence with spacy
doc = nlp(sentence)

In [10]:
#extract the dependency parse tree

for token in doc:
  print("Word >>", token.text," >> Lemma >>", token.lemma_,">>  POS tag:>>", token.pos_, "  Dependency:>>", token.dep_, "  Head:>>", token.head.text)

Word >> The  >> Lemma >> the >>  POS tag:>> DET   Dependency:>> det   Head:>> cat
Word >> cat  >> Lemma >> cat >>  POS tag:>> NOUN   Dependency:>> nsubj   Head:>> chased
Word >> chased  >> Lemma >> chase >>  POS tag:>> VERB   Dependency:>> ROOT   Head:>> chased
Word >> the  >> Lemma >> the >>  POS tag:>> DET   Dependency:>> det   Head:>> mouse
Word >> mouse  >> Lemma >> mouse >>  POS tag:>> NOUN   Dependency:>> dobj   Head:>> chased
Word >> .  >> Lemma >> . >>  POS tag:>> PUNCT   Dependency:>> punct   Head:>> chased


####  Training a custom dependency parsing model using PyTorch

In [4]:
!pip install -U torch==1.8.0 torchtext==0.9.0

# Reload environment
exit()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.8.0
  Downloading torch-1.8.0-cp39-cp39-manylinux1_x86_64.whl (735.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m735.5/735.5 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-cp39-cp39-manylinux1_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 2.0.0+cu118
    Uninstalling torch-2.0.0+cu118:
      Successfully uninstalled torch-2.0.0+cu118
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.1
    Uninstalling torchtext-0.15.1:
      Successfully uninstalled torchtext-0.15.1
[31mERROR: pip's dependency resolver does not currently take into account all the

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, Dataset, Example
from torchtext.legacy.data.iterator import BucketIterator


In [3]:
# Define the custom dependency parsing model

class DependencyParserModel(nn.Module):
  def __init__(self, input_dim,hidden_dim):
    super().__init__()
    self.input_dim = input_dim
    self.hidden_dim=hidden_dim
    self.linear1 = nn.Linear(input_dim,hidden_dim)
    self.linear2 = nn.Linear(hidden_dim,hidden_dim)
    self.linear3 = nn.Linear(hidden_dim,1)

  def forward(self,x):
    x = torch.relu(self.linear1(x))
    x = torch.relu(self.linear2(x))
    x = self.linear3(x)
    return x

In [29]:
# Defining custom dataet for dependency parsing
class DependencyParserDataset(Dataset):
    def __init__(self, examples, text_field, head_field):
        fields = [('text', text_field), ('head', head_field)]
        super().__init__(examples, fields)
  

  

In [30]:
# Define the training loop
def train(model, iterator, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch in iterator:
        inputs = batch.text.to(device)
        targets = batch.head.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.squeeze(1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(iterator)

In [31]:
# Define hyperparameters
input_dim = 300 # Input dimension (e.g., word embeddings)
hidden_dim = 64 # Hidden dimension of the model
batch_size = 32 # Batch size for training
learning_rate = 0.001 # Learning rate for optimizer
epochs = 10 # Number of epochs for training


In [32]:
# Define the fields for the data
TEXT = Field(sequential=True, use_vocab=True)
HEAD = Field(sequential=False, use_vocab=False)

In [33]:
# Load the Universal Dependencies dataset manually
train_file_path = '/content/en_atis-ud-train.conllu'
valid_file_path = '/content/en_atis-ud-dev.conllu'

In [63]:
# Read the dataset file
train_file = open(train_file_path, 'r', encoding='utf-8')
valid_file = open(valid_file_path, 'r', encoding='utf-8')


In [64]:
for line in train_file:
  print(line)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4	a	a	DET	_	PronType=Art	6	det	_	_

5	nonstop	nonstop	NOUN	_	Number=Sing	6	compound	_	_

6	flight	flight	NOUN	_	Number=Sing	3	obj	_	_

7	from	from	ADP	_	_	8	case	_	_

8	indianapolis	Indianapolis	PROPN	_	Number=Sing	6	nmod	_	_

9	to	to	ADP	_	_	10	case	_	_

10	san	San	PROPN	_	Number=Sing	6	nmod	_	_

11	diego	Diego	PROPN	_	Number=Sing	10	flat	_	_

12	that	that	ADP	_	_	13	mark	_	_

13	serves	serve	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	6	acl:relcl	_	_

14	dinner	dinner	NOUN	_	Number=Sing	13	obj	_	_

15	what	what	PRON	_	PronType=Int,Rel	17	nsubj	_	_

16	's	be	AUX	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	17	cop	_	_

17	available	available	ADJ	_	Degree=Pos	3	parataxis	_	_



# sent_id = 1010.train

# text = what is the earliest breakfast flight from philadelphia to fort worth

1	what	what	PRON	_	PronType=Int,Rel	0	root	_	_

2	is	be	AUX	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	1

KeyboardInterrupt: ignored

In [60]:
# Create examples from the dataset
train_examples = [Example.fromlist([line.strip().split('\t')[:2]], fields=[('text', TEXT), ('head', HEAD)]) for line in train_file]
valid_examples = [Example.fromlist([line.strip().split('\t')[:2]], fields=[('text', TEXT), ('head', HEAD)]) for line in valid_file]

# Close the dataset file
train_file.close()
valid_file.close()

# Create a custom dataset class
class DependencyParserDataset(Dataset):
    def __init__(self, examples, fields):
        super().__init__(examples, fields)

# Create the datasets
train_dataset = DependencyParserDataset(train_examples, fields=[('text', TEXT), ('head', HEAD)])
valid_dataset = DependencyParserDataset(valid_examples, fields=[('text', TEXT), ('head', HEAD)])

# Create the data loaders
train_iterator = BucketIterator(train_dataset, batch_size=batch_size, shuffle=True, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
valid_iterator = BucketIterator(valid_dataset, batch_size=batch_size, shuffle=True, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# Access batch elements
for batch in train_iterator:
  print(batch.text)
  print(batch.head)
  break

AttributeError: ignored

In [47]:
# Create examples from the dataset
train_examples = [Example.fromlist([line.strip().split('\t')[:2]], fields=[('text', TEXT), ('head', HEAD)]) for line in train_file]
valid_examples = [Example.fromlist([line.strip().split('\t')[:2]], fields=[('text', TEXT), ('head', HEAD)]) for line in valid_file]


In [48]:
# Close the dataset file
train_file.close()
valid_file.close()


In [49]:
# Extract tokens from examples
train_tokens = [token for example in train_examples for token in example.text]
valid_tokens = [token for example in valid_examples for token in example.text]


In [50]:
train_tokens[1]

'# text = what is the cost of a round trip flight from pittsburgh to atlanta beginning on april twenty fifth and returning on may sixth'

In [51]:
# Build the vocabulary
TEXT.build_vocab(train_tokens, min_freq=1)  # You can specify min_freq to control the minimum frequency of words in the vocabulary


In [52]:
# Create the datasets
train_dataset = DependencyParserDataset(train_examples, text_field=TEXT, head_field=HEAD)
valid_dataset = DependencyParserDataset(valid_examples, text_field=TEXT, head_field=HEAD)


In [53]:
# Create the data loaders
train_iterator = BucketIterator(train_dataset, batch_size=batch_size, shuffle=True, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
valid_iterator = BucketIterator(valid_dataset, batch_size=batch_size, shuffle=True, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [55]:
for batch in train_iterator:
  print(batch)
  break
  

AttributeError: ignored