Q4

In [2]:
from collections import defaultdict

In [5]:
# collecting the distinct tags that occur in the training data
def get_unique_tags(sentence_tags):
    """Return the set of distinct tags found across all tag sequences.

    Args:
        sentence_tags: iterable of per-sentence tag sequences.

    Returns:
        A ``set`` holding every tag that appears at least once.
    """
    return {tag for tags in sentence_tags for tag in tags}

# writing the final predictions into the dev folder for evaluation
def write_output(path):
    """Train on ``{path}/train`` and write predictions for ``{path}/dev.in``.

    A fresh ``Perceptron`` is trained for 10 epochs on the training file, then
    every sentence of ``{path}/dev.in`` is tagged and written to
    ``{path}/dev.p4.out`` as one ``token label`` line per token followed by a
    blank line after each sentence.

    Args:
        path: dataset directory containing ``train`` and ``dev.in``.
    """
    # Training:
    sentences, sentence_tags = read_data(f'{path}/train')
    all_tags = get_unique_tags(sentence_tags)
    perceptron = Perceptron(all_tags)
    train(perceptron, sentences, sentence_tags, epochs=10)

    # Test prediction:
    # The encoding is given explicitly so that non-ASCII tokens (e.g. the RU
    # data) are read and written the same way on every platform instead of
    # depending on the locale's default encoding.
    with open(f'{path}/dev.in', encoding='utf-8') as f:
        test_data = f.read().splitlines()

    test_sentences = extract_data_test(test_data)
    with open(f'{path}/dev.p4.out', "w", encoding='utf-8') as f:
        for sentence in test_sentences:
            predicted_labels = perceptron.predict(sentence)
            for token, label in zip(sentence, predicted_labels):
                f.write(token + " " + label + "\n")
            # A blank line closes every sentence, including empty ones.
            f.write("\n")

# writing the final predictions into the test folder for submission
def test_write_output(path, dataset_type):
    """Train on the bundled training data and tag ``{path}/test.in``.

    The training file is ``Data/ES/train`` when ``dataset_type == "es"`` and
    ``Data/RU/train`` for any other value. After 10 training epochs every
    sentence of ``{path}/test.in`` is tagged and written to
    ``{path}/test.p4.out`` as one ``token label`` line per token followed by a
    blank line after each sentence.

    Args:
        path: directory containing ``test.in``; the output is written there.
        dataset_type: ``"es"`` selects the ES training data, anything else
            selects the RU training data.
    """
    # training
    if dataset_type == "es":
        file_path = 'Data/ES/train'
    else:
        file_path = 'Data/RU/train'
    sentences, sentence_tags = read_data(file_path)
    all_tags = get_unique_tags(sentence_tags)  # getting the unique tags
    perceptron = Perceptron(all_tags)
    train(perceptron, sentences, sentence_tags, epochs=10)

    # testing
    # The encoding is given explicitly so that non-ASCII tokens (e.g. the RU
    # data) are read and written the same way on every platform instead of
    # depending on the locale's default encoding.
    with open(f'{path}/test.in', encoding='utf-8') as f:
        test_data = f.read().splitlines()

    test_sentences = extract_data_test(test_data)
    with open(f'{path}/test.p4.out', "w", encoding='utf-8') as f:
        for sentence in test_sentences:
            predicted_labels = perceptron.predict(sentence)
            for token, label in zip(sentence, predicted_labels):
                f.write(token + " " + label + "\n")
            # A blank line closes every sentence, including empty ones.
            f.write("\n")

# To get the sentences and sentence tags from the file
def read_data(file_path):
    """Read a labelled file into parallel lists of tokens and tags.

    Each non-blank line holds a token followed by its label; a blank line ends
    the current sentence. The label is taken as the text after the last run of
    whitespace, so tokens that themselves contain spaces (the RU data has
    lines such as ``. .. O``) are kept intact.

    Args:
        file_path: path of the labelled file, read as UTF-8.

    Returns:
        ``(sentences, sentence_tags)``: two lists of equal length where
        ``sentences[i]`` is the token list and ``sentence_tags[i]`` the label
        list of the i-th sentence.
    """
    sentences = []
    sentence_tags = []
    sentence, tags = [], []
    # Explicit encoding: the data contains non-ASCII text and must not depend
    # on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: close the current sentence.
                sentences.append(sentence)
                sentence_tags.append(tags)
                sentence, tags = [], []
                continue

            # Split once from the right so only the final field is the label.
            parts = line.rsplit(maxsplit=1)
            if len(parts) == 2:
                token, label = parts
            else:
                # Line without any separator: keep the previous best-effort
                # behaviour of treating the last character as the label.
                token, label = line[:-1].strip(), line[-1]
            sentence.append(token)
            tags.append(label)

    # A file that does not end with a blank line still has a pending sentence.
    if sentence:
        sentences.append(sentence)
        sentence_tags.append(tags)
    return sentences, sentence_tags

def train(perceptron, sentences, sentence_tags, epochs=10000):
    """Run ``epochs`` passes over the data, updating the model per sentence.

    Args:
        perceptron: object exposing ``update(sentence, tags)``.
        sentences: token lists, one per sentence.
        sentence_tags: tag lists paired position-wise with ``sentences``.
        epochs: number of full passes over the paired data.
    """
    def _single_pass():
        # One sweep over every (sentence, tags) pair in order.
        for words, gold_tags in zip(sentences, sentence_tags):
            perceptron.update(words, gold_tags)

    for _epoch in range(epochs):
        _single_pass()

# extracting only sentences from the test data
def extract_data_test(data):
    """Group the lines of an unlabelled file into sentences.

    An empty string marks the end of a sentence. A final sentence that is not
    followed by an empty string is still returned instead of being dropped.

    Args:
        data: iterable of lines (without newline characters), one token per
            line, with ``""`` separating sentences.

    Returns:
        A list of sentences, each a list of the lines it contains.
    """
    sentences = []
    current = []
    for line in data:
        if line == "":
            sentences.append(current)
            current = []
        else:
            current.append(line)
    # Input without a trailing separator leaves one sentence pending.
    if current:
        sentences.append(current)
    return sentences

In [6]:

class Perceptron:
    """Per-token multi-class perceptron tagger.

    Every tag has one bias and one weight per token string. The score of a
    tag for a token is ``weights[tag][token] + bias[tag]`` (missing weights
    count as 0) and the highest-scoring tag is predicted.
    """

    def __init__(self, all_tags, learning_rate=0.001):
        """Create an untrained model.

        Args:
            all_tags: collection of possible tags. A ``set``/``frozenset`` is
                sorted; any other iterable keeps its given order.
            learning_rate: amount added to / subtracted from weights and
                biases on every misprediction.
        """
        # predict() breaks score ties by tag order, and the iteration order of
        # a set of strings can differ between interpreter runs. Sorting makes
        # training and the reported metrics reproducible.
        if isinstance(all_tags, (set, frozenset)):
            self.tags = sorted(all_tags)
        else:
            self.tags = list(all_tags)
        self.learning_rate = learning_rate
        self.weights = defaultdict(dict)   # tag -> {token: weight}
        self.bias = defaultdict(float)     # tag -> bias

    def update(self, sentence, correct_tags):
        """Apply one perceptron update for a labelled sentence.

        The whole sentence is predicted first; then, for every position where
        the prediction differs from the gold tag, the gold tag's weight for
        that token and its bias are raised by ``learning_rate`` and the
        predicted tag's are lowered by the same amount.

        Args:
            sentence: list of tokens.
            correct_tags: gold tags, paired position-wise with ``sentence``.
        """
        pred_tags = self.predict(sentence)

        for token, correct_tag, pred_tag in zip(sentence, correct_tags, pred_tags):
            if correct_tag != pred_tag:
                self.weights[correct_tag][token] = self.weights[correct_tag].get(token, 0) + self.learning_rate
                self.weights[pred_tag][token] = self.weights[pred_tag].get(token, 0) - self.learning_rate
                self.bias[correct_tag] += self.learning_rate
                self.bias[pred_tag] -= self.learning_rate

    def predict(self, sentence):
        """Return the highest-scoring tag for every token of ``sentence``.

        Each token is scored independently of its neighbours. On equal scores
        the tag that comes first in ``self.tags`` wins.

        Args:
            sentence: list of tokens.

        Returns:
            A list of predicted tags, one per token.
        """
        pred_tags = []
        for token in sentence:
            scores = {tag: self.weights[tag].get(token, 0) + self.bias[tag] for tag in self.tags}
            pred_tags.append(max(scores, key=scores.get))
        return pred_tags

In [7]:
# Train on each dataset and write its dev-set predictions.
# ES (Spanish):
write_output(path='Data/ES')
# RU (Russian):
write_output(path='Data/RU')

## Evaluation Results (dev set):

### **ES (Spanish) Results:**
- **Entity Statistics:**
  - Entities in gold data: `229`
  - Entities in prediction: `212`
  - Correct Entity count: `129`
  - **Precision:** `0.6085`
  - **Recall:** `0.5633`
  - **F-score:** `0.5850`

- **Sentiment Statistics:**
  - Correct Sentiment count: `102`
  - **Precision:** `0.4811`
  - **Recall:** `0.4454`
  - **F-score:** `0.4626`

---

### **RU (Russian) Results:**
- **Entity Statistics:**
  - Entities in gold data: `389`
  - Entities in prediction: `473`
  - Correct Entity count: `221`
  - **Precision:** `0.4672`
  - **Recall:** `0.5681`
  - **F-score:** `0.5128`

- **Sentiment Statistics:**
  - Correct Sentiment count: `159`
  - **Precision:** `0.3362`
  - **Recall:** `0.4087`
  - **F-score:** `0.3689`

---


## Running the newly released test set

In [8]:
# Generate submission files for the newly released test set.
# ES (Spanish): trains on Data/ES/train, tags Test/ES/test.in
test_write_output(path='Test/ES', dataset_type='es')
# RU (Russian): trains on Data/RU/train, tags Test/RU/test.in
test_write_output(path='Test/RU', dataset_type='ru')