# ANLI - Data Preprocessing

This notebook handles data preprocessing for the ANLI dataset.
Note: ANLI is already quite well-preprocessed, so minimal cleaning was needed.

Focus:
- Verify data quality
- Check for missing values
- Prepare premise/hypothesis text pairs and label arrays for modeling

In [10]:
import sys
from pathlib import Path

sys.path.append('../src')

import pandas as pd
import numpy as np
from data_loader import ANLIDataLoader

In [11]:
# Load data
# ANLIDataLoader comes from the project-local `data_loader` module (made
# importable by the sys.path tweak in the import cell).
loader = ANLIDataLoader()
# load_data() is unpacked into three objects, used below as pandas DataFrames
# for the train / dev / test splits.
# NOTE(review): the split sizes shown in the cell output are presumably
# printed by load_data() itself -- confirm in src/data_loader.py.
train_df, dev_df, test_df = loader.load_data()

Loading ANLI dataset:
Train size: 45460
Dev size: 1000
Test size: 1000


In [12]:
# Check for missing values
# Report the per-column count of null entries for each split, in the order
# train -> dev -> test.
print("Missing values check:")
for heading, split_df in (
    ("\nTrain set:", train_df),
    ("\nDev set:", dev_df),
    ("\nTest set:", test_df),
):
    print(heading)
    # isna() is the pandas alias of isnull(); summing the boolean frame gives
    # one null count per column.
    print(split_df.isna().sum())

Missing values check:

Train set:
uid           0
premise       0
hypothesis    0
label         0
reason        0
dtype: int64

Dev set:
uid           0
premise       0
hypothesis    0
label         0
reason        0
dtype: int64

Test set:
uid           0
premise       0
hypothesis    0
label         0
reason        0
dtype: int64


In [13]:
# Check for duplicate entries
# A row counts as a duplicate when its (premise, hypothesis) pair has already
# appeared earlier in the same split; labels are not part of the comparison.
print("\nDuplicate check:")
pair_cols = ['premise', 'hypothesis']
train_dupes, dev_dupes, test_dupes = (
    split.duplicated(subset=pair_cols).sum()
    for split in (train_df, dev_df, test_df)
)
print(f"Train duplicates: {train_dupes}")
print(f"Dev duplicates: {dev_dupes}")
print(f"Test duplicates: {test_dupes}")


Duplicate check:
Train duplicates: 31
Dev duplicates: 0
Test duplicates: 0


In [14]:
# Check label distribution
# Show how many examples carry each label id per split, ordered by label id
# (sort_index) rather than by frequency.
print("\nLabel distribution:")
for heading, split_df in (
    ("\nTrain:", train_df),
    ("\nDev:", dev_df),
    ("\nTest:", test_df),
):
    label_counts = split_df['label'].value_counts()
    print(heading)
    print(label_counts.sort_index())


Label distribution:

Train:
label
0    14448
1    20959
2    10053
Name: count, dtype: int64

Dev:
label
0    334
1    333
2    333
Name: count, dtype: int64

Test:
label
0    334
1    333
2    333
Name: count, dtype: int64


In [15]:
# Prepare text pairs
# prepare_text_pairs is a project helper on the loader; each call's result is
# unpacked into a (premises, hypotheses) pair for that split.
train_premises, train_hypotheses = loader.prepare_text_pairs(train_df)
dev_premises, dev_hypotheses = loader.prepare_text_pairs(dev_df)
test_premises, test_hypotheses = loader.prepare_text_pairs(test_df)

# Pull the label column of every split out as an array, in train/dev/test order.
train_labels, dev_labels, test_labels = (
    split['label'].values for split in (train_df, dev_df, test_df)
)

# Report how many examples each split contributes.
print(f"\nPrepared {len(train_labels)} training pairs")
print(f"Prepared {len(dev_labels)} dev pairs")
print(f"Prepared {len(test_labels)} test pairs")


Prepared 45460 training pairs
Prepared 1000 dev pairs
Prepared 1000 test pairs


In [16]:
# Verify data integrity
# All three splits are written to disk at the end of the notebook, so each one
# is validated here rather than the training set alone.
splits_to_check = {"train": train_df, "dev": dev_df, "test": test_df}

# One boolean per split for each check. Label ids are expected to lie in the
# inclusive range 0..2; text fields must have a non-zero length.
integrity_results = {
    "All labels in valid range": {
        split_name: bool(split_df['label'].between(0, 2).all())
        for split_name, split_df in splits_to_check.items()
    },
    "No empty premises": {
        split_name: bool((split_df['premise'].str.len() > 0).all())
        for split_name, split_df in splits_to_check.items()
    },
    "No empty hypotheses": {
        split_name: bool((split_df['hypothesis'].str.len() > 0).all())
        for split_name, split_df in splits_to_check.items()
    },
}

print("\nData integrity check:")
for check_name, per_split in integrity_results.items():
    # The summary line is True only when every split passes the check.
    print(f"{check_name}: {all(per_split.values())}")
    # Name the offending splits only when something is wrong, so the output of
    # a clean run stays the same three summary lines.
    failing_splits = [name for name, passed in per_split.items() if not passed]
    if failing_splits:
        print(f"  -> failed for: {', '.join(failing_splits)}")


Data integrity check:
All labels in valid range: True
No empty premises: True
No empty hypotheses: True


In [17]:
# Save preprocessed data (optional)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise a fresh checkout fails here on
# Restart & Run All.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the CSV files.
train_df.to_csv(processed_dir / 'train_r2.csv', index=False)
dev_df.to_csv(processed_dir / 'dev_r2.csv', index=False)
test_df.to_csv(processed_dir / 'test_r2.csv', index=False)

print("\n Preprocessed data saved to ../data/processed/")


 Preprocessed data saved to ../data/processed/
