In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import os


# Directory containing the dataset's .conllu split files; each split's file
# name is joined onto this path when the data is loaded.
INPUT_DIR = "input/UD_English-EWT"

In [2]:
import conllu
import itertools


def conllu_to_pd(file_path: str) -> pd.DataFrame:
    """Load a CoNLL-U file into a flat, token-level DataFrame.

    The file is read in full, parsed with ``conllu.parse`` and flattened so
    that each token of each sentence becomes one row.

    Args:
        file_path: Path to the ``.conllu`` file to load.

    Returns:
        A DataFrame with three columns:
            * ``words``   - the token's ``form`` field,
            * ``pos``     - the token's ``upos`` field,
            * ``sent_id`` - the ``sent_id`` metadata value of the sentence
              the token belongs to (repeated for every token of it).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        KeyError: If a parsed sentence has no ``sent_id`` metadata entry.
    """
    print("\tReading data...")
    # CoNLL-U is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the platform locale, which breaks or mis-decodes
    # non-ASCII tokens on systems whose default is not UTF-8.
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read()

    print("\tParsing data...")
    conllu_sentences = conllu.parse(data)

    # NOTE(review): every token the parser yields becomes a row. If the parser
    # also yields multiword-token range lines or empty nodes, they end up in
    # the frame as well - confirm whether those should be filtered out.

    print("\tGetting words...")
    words = [word["form"] for sentence in tqdm(conllu_sentences) for word in sentence]

    print("\tGetting POS tags...")
    pos = [word["upos"] for sentence in tqdm(conllu_sentences) for word in sentence]

    print("\tGetting Sentence ids...")
    sent_ids = []
    for sent in tqdm(conllu_sentences):
        # Repeat the sentence id once per token so it lines up with the
        # flattened word and tag columns.
        sent_ids.extend([sent.metadata["sent_id"]] * len(sent))

    # Plain lists (rather than one-shot iterators) keep the column lengths
    # explicit and let pandas validate them directly.
    return pd.DataFrame({"words": words,
                         "pos": pos,
                         "sent_id": sent_ids})

In [3]:
# Parse each dataset split located under INPUT_DIR into its own DataFrame.
print("Loading training dataset...")
train_df = conllu_to_pd(os.path.join(INPUT_DIR, "en_ewt-ud-train.conllu"))

print("Loading validation dataset...")
val_df = conllu_to_pd(os.path.join(INPUT_DIR, "en_ewt-ud-dev.conllu"))

print("Loading test dataset...")
test_df = conllu_to_pd(os.path.join(INPUT_DIR, "en_ewt-ud-test.conllu"))

# Summarise the (rows, columns) shape of every split, one split per line.
print(
    f"Training data shape: {train_df.shape}\n"
    f"Validation data shape: {val_df.shape}\n"
    f"Test data shape: {test_df.shape}"
)

Loading training dataset...
	Reading data...
	Parsing data...
	Getting words...


  0%|          | 0/12544 [00:00<?, ?it/s]

	Getting POS tags...


  0%|          | 0/12544 [00:00<?, ?it/s]

	Getting Sentence ids...


  0%|          | 0/12544 [00:00<?, ?it/s]

Loading validation dataset...
	Reading data...
	Parsing data...
	Getting words...


  0%|          | 0/2001 [00:00<?, ?it/s]

	Getting POS tags...


  0%|          | 0/2001 [00:00<?, ?it/s]

	Getting Sentence ids...


  0%|          | 0/2001 [00:00<?, ?it/s]

Loading test dataset...
	Reading data...
	Parsing data...
	Getting words...


  0%|          | 0/2077 [00:00<?, ?it/s]

	Getting POS tags...


  0%|          | 0/2077 [00:00<?, ?it/s]

	Getting Sentence ids...


  0%|          | 0/2077 [00:00<?, ?it/s]

Training data shape: (207227, 3)
Validation data shape: (25511, 3)
Test data shape: (25450, 3)


In [4]:
# Show the training frame as the cell's output; the rendered table is
# truncated to the first and last few rows.
train_df

Unnamed: 0,words,pos,sent_id
0,Al,PROPN,weblog-juancole.com_juancole_20051126063000_EN...
1,-,PUNCT,weblog-juancole.com_juancole_20051126063000_EN...
2,Zaman,PROPN,weblog-juancole.com_juancole_20051126063000_EN...
3,:,PUNCT,weblog-juancole.com_juancole_20051126063000_EN...
4,American,ADJ,weblog-juancole.com_juancole_20051126063000_EN...
...,...,...,...
207222,on,ADP,reviews-319816-0029
207223,my,PRON,reviews-319816-0029
207224,car,NOUN,reviews-319816-0029
207225,),PUNCT,reviews-319816-0029
