In [4]:
import pandas as pd
import os


# Split Dataset

In [21]:

# Directory that holds the training XML files.
data_folder = "coliee_data/task4/COLIEE2022statute_data-English/train"
# Year used to name the first training split written by the loop below.
start_year = 2022
# Keep only the XML files, in sorted order. The split loop below sizes its
# iterations with len(train_files) and pops the last entry as the test file,
# so a stray non-XML entry (e.g. ".ipynb_checkpoints") would otherwise shift
# the year labels or be handed to pd.read_xml.
train_files = sorted(
    name for name in os.listdir(data_folder) if name.endswith(".xml")
)

def prep_data(output_file, data, label_to_int=None):
    """Normalise one frame of premise/hypothesis pairs and write it as CSV.

    Steps: rename ``t1``/``t2`` to ``premise``/``hypothesis``, keep the
    original string label in a new ``labels`` column, replace ``label`` with
    its integer code, drop every row that has a missing value, and write the
    result to ``output_file`` without the index.

    Args:
        output_file: Destination CSV path; the parent directory is created
            if it does not exist.
        data: DataFrame with at least a ``label`` column. It is not modified.
        label_to_int: Optional mapping from string label to integer code.
            Defaults to ``{'N': 0, 'Y': 1}`` (assumes the frames use the same
            Y/N labels as the answer file; pass a mapping otherwise).

    Returns:
        The prepared DataFrame that was written to disk.

    Raises:
        KeyError: If ``data`` has no ``label`` column.
        ValueError: If a non-missing label is absent from ``label_to_int``.
    """
    if label_to_int is None:
        # A fixed encoding keeps the integer codes identical across files.
        # Deriving it from the order of first appearance would let 'Y' be 0
        # in one file and 1 in another.
        label_to_int = {'N': 0, 'Y': 1}

    # rename() returns a new frame, so the caller's DataFrame stays untouched.
    renamed = data.rename(columns={'t1': 'premise', 't2': 'hypothesis'})
    raw_labels = renamed['label']

    unknown = set(raw_labels.dropna().unique()) - set(label_to_int)
    if unknown:
        raise ValueError(
            f"Labels missing from label_to_int: {sorted(map(str, unknown))}"
        )

    prepared = (
        renamed
        .assign(labels=raw_labels, label=raw_labels.map(label_to_int))
        .dropna()
        .reset_index(drop=True)
        # map() yields floats when missing labels were present; once those
        # rows are dropped the codes are written back as integers.
        .astype({'label': int})
    )

    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    prepared.to_csv(output_file, index=False)
    return prepared

meta_data = {}  # not read anywhere in the visible cells

# Work with XML files only: the loop below sizes itself with len(train_files)
# and pops the last entry as the test file, so any other directory entry
# would skew the year labels or be handed to pd.read_xml.
train_files = [name for name in train_files if name.endswith(".xml")]

print(f"Total number of files: {len(train_files)}")

# Parse every file once up front instead of re-reading the whole directory
# on each iteration of the loop.
parsed_frames = {
    name: pd.read_xml(os.path.join(data_folder, name)) for name in train_files
}

while len(train_files) > 2:
    print(f"Processing data for year {start_year}")
    print(f"Number of files: {len(train_files)}")
    output_file = f"coliee_data/processed/train/coliee_train_{start_year}.csv"
    # Training split for this year: every file still in the list.
    data = pd.concat(
        [parsed_frames[name] for name in train_files], ignore_index=True
    )
    prep_data(output_file, data)

    # The last remaining file (in sorted order) is removed from the list and
    # written as the test split named after the decremented year, so it is
    # excluded from the next, smaller training split.
    start_year -= 1
    last_file = train_files.pop()
    test_file = f"coliee_data/processed/test/coliee_test_{start_year}.csv"
    test_df = parsed_frames.pop(last_file)
    prep_data(test_file, test_df)
    print(f"Test file: {last_file} : Year {start_year}")
    



Total number of files: 15
Processing data for year 2022
Number of files: 15
Test file: riteval_R02_en.xml : Year 2021
Processing data for year 2021
Number of files: 14
Test file: riteval_R01_en.xml : Year 2020
Processing data for year 2020
Number of files: 13
Test file: riteval_H30_en.xml : Year 2019
Processing data for year 2019
Number of files: 12
Test file: riteval_H29_en.xml : Year 2018
Processing data for year 2018
Number of files: 11
Test file: riteval_H28_en.xml : Year 2017
Processing data for year 2017
Number of files: 10
Test file: riteval_H27_en.xml : Year 2016
Processing data for year 2016
Number of files: 9
Test file: riteval_H26_en.xml : Year 2015
Processing data for year 2015
Number of files: 8
Test file: riteval_H25_en.xml : Year 2014
Processing data for year 2014
Number of files: 7
Test file: riteval_H24_en.xml : Year 2013
Processing data for year 2013
Number of files: 6
Test file: riteval_H23_en.xml : Year 2012
Processing data for year 2012
Number of files: 5
Test file

In [5]:
# Load the 2022 test XML and switch to the premise/hypothesis column names.
test_2022 = "coliee_data/task4/test-for-task4/TestData_en.xml"
test_file = "coliee_data/processed/test/coliee_test_2022.csv"
test_2022_df = pd.read_xml(test_2022).rename(
    columns={'t1': 'premise', 't2': 'hypothesis'}
)



In [6]:
# Load the answer file as a list of raw lines (line endings are kept).
test_label_file = "coliee_data/task4/test-for-task4/answer-task4"
with open(test_label_file, 'r') as answer_handle:
    test_data = list(answer_handle)

print(test_data)


['R03-01-E\tN\n', 'R03-01-I\tY\n', 'R03-01-O\tN\n', 'R03-02-A\tY\n', 'R03-02-E\tY\n', 'R03-02-I\tN\n', 'R03-02-O\tN\n', 'R03-03-A\tY\n', 'R03-03-E\tN\n', 'R03-03-I\tY\n', 'R03-03-O\tN\n', 'R03-03-U\tY\n', 'R03-04-A\tN\n', 'R03-04-E\tN\n', 'R03-04-I\tY\n', 'R03-04-O\tY\n', 'R03-04-U\tY\n', 'R03-05-A\tN\n', 'R03-05-O\tN\n', 'R03-05-U\tY\n', 'R03-07-A\tN\n', 'R03-07-E\tN\n', 'R03-07-I\tY\n', 'R03-07-O\tY\n', 'R03-08-A\tN\n', 'R03-08-E\tY\n', 'R03-08-O\tY\n', 'R03-08-U\tY\n', 'R03-09-A\tY\n', 'R03-09-E\tY\n', 'R03-09-U\tN\n', 'R03-10-A\tN\n', 'R03-10-O\tY\n', 'R03-10-U\tY\n', 'R03-12-A\tN\n', 'R03-12-E\tY\n', 'R03-12-I\tY\n', 'R03-12-O\tN\n', 'R03-12-U\tN\n', 'R03-13-A\tY\n', 'R03-15-A\tY\n', 'R03-15-E\tN\n', 'R03-15-I\tY\n', 'R03-15-U\tN\n', 'R03-16-E\tY\n', 'R03-16-O\tN\n', 'R03-16-U\tY\n', 'R03-17-I\tY\n', 'R03-17-O\tY\n', 'R03-17-U\tY\n', 'R03-19-A\tY\n', 'R03-19-E\tY\n', 'R03-19-I\tN\n', 'R03-19-O\tY\n', 'R03-19-U\tN\n', 'R03-20-A\tN\n', 'R03-20-E\tY\n', 'R03-20-I\tY\n', 'R03-20-O\tN\

In [8]:
# Parse the answer lines ("<id>\t<label>") into an id -> label mapping.
label_dict = {}
for answer_line in test_data:
    # strip() removes the trailing newline and any "\r" left by CRLF files.
    fields = answer_line.strip().split('\t')
    if len(fields) < 2:
        # Blank line or a line without a label: skip it instead of failing
        # on the missing field or recording an empty label as its own class.
        continue
    label_dict[fields[0].strip()] = fields[1].strip()

# convert the label dictionary to a dataframe
# (building it from (id, label) pairs also works when no line was parsed)
label_df = pd.DataFrame(list(label_dict.items()), columns=['id', 'label'])

# join the label dataframe with the test dataframe; with how='left' a test
# row whose id has no answer keeps a missing label
test_df = test_2022_df.merge(label_df, on='id', how='left')




In [10]:
# Convert the string labels to integers with a fixed mapping. Deriving the
# codes from the order in which labels first appear would make the encoding
# depend on row order, so it could differ between this file and others.
label_to_int = {'N': 0, 'Y': 1}

# If this cell is re-run, 'label' already holds integers; the original
# strings are then taken from the 'labels' column written by the first run.
raw_labels = test_df['labels'] if 'labels' in test_df.columns else test_df['label']

unique_labels = raw_labels.dropna().unique()
unknown_labels = set(unique_labels) - set(label_to_int)
if unknown_labels:
    raise ValueError(
        f"Unexpected label values: {sorted(map(str, unknown_labels))}"
    )

test_df = (
    test_df
    .assign(labels=raw_labels, label=raw_labels.map(label_to_int))
    # Rows left without a label by the merge (or with any other missing
    # value) are removed here.
    .dropna()
    .reset_index(drop=True)
    # map() yields floats when missing labels were present; store integers.
    .astype({'label': int})
)

test_file = "coliee_data/processed/test/coliee_test_2022.csv"
os.makedirs(os.path.dirname(test_file), exist_ok=True)
test_df.to_csv(test_file, index=False)