In [6]:
import pandas as pd

def create_dataframe_from_iob2(filename):
    """
    Create a DataFrame from an IOB2 file containing tokens and labels.

    Parameters:
    filename (str): The path to the IOB2 file.

    Returns:
    pd.DataFrame: DataFrame containing the tokens and labels.
    """
    # Open the IOB2 file and read its contents
    with open(filename, "r", encoding="utf-8") as f:
        # Read the contents of the file
        lines = f.readlines()

    # Initialize lists to store tokens and labels
    tokens_list = []
    labels_list = []

    # Process each line in the file
    for line in lines:
        # Split the line into tokens and their corresponding labels
        tokens = line.strip().split()
        if tokens:
            # Extract token and label
            token = tokens[0]
            label = tokens[-1]
            # Append token and label to respective lists
            tokens_list.append(token)
            labels_list.append(label)

    # Create a DataFrame from the lists of tokens and labels
    df = pd.DataFrame({"Token": tokens_list, "Label": labels_list})
    
    return df


# Call the function to create DataFrames for the specified files
dev_df = create_dataframe_from_iob2("en_ewt-ud-dev.iob2")
test_df = create_dataframe_from_iob2("en_ewt-ud-test-masked.iob2")
train_df = create_dataframe_from_iob2("en_ewt-ud-train.iob2")

# Display the DataFrames
print("Dev DataFrame:")
print(dev_df)
print("\nTest DataFrame:")
print(test_df)
print("\nTrain DataFrame:")
print(train_df)

Dev DataFrame:
      Token                                   Label
0         #       answers-20070404104007AAY1Chs_ans
1         #  answers-20070404104007AAY1Chs_ans-0001
2         #                                 please?
3         1                                       -
4         2                                       -
...     ...                                     ...
29464    21                                       -
29465    22                                       -
29466    23                                       -
29467    24                                       -
29468    25                                       -

[29469 rows x 2 columns]

Test DataFrame:
      Token                                   Label
0         #       answers-20080426140040AA4YiX5_ans
1         #  answers-20080426140040AA4YiX5_ans-0001
2         #                                Miramar?
3         1                                       -
4         2                                       -
...   