In [None]:
def notAUGC(seq:str):
    """
    This function takes a sequence and returns True if the sequence
    contains any character other than A, U, G, or C. Otherwise, it returns False.
    """
    for char in seq:
        if char not in 'AUGC':
            return True
    return False

In [11]:
import pandas as pd

def validate_train_data(seq_csv, labels_csv):
    sequences_df = pd.read_csv(seq_csv)
    labels_df = pd.read_csv(labels_csv)

    # Extract target_id from label IDs like "7KUC_0_1" → "7KUC_0"
    labels_df['target_id'] = labels_df['ID'].astype(str).str.rsplit('_', n=1).str[0]
    label_counts = labels_df['target_id'].value_counts().to_dict()

    mismatches = []

    for _, row in sequences_df.iterrows():
        target_id = row['target_id']
        sequence = row['sequence']

        # Validate sequence is a string of letters only
        if not isinstance(sequence, str) or not sequence.isalpha() or notAUGC(sequence):
            mismatches.append({
                "target_id": target_id,
                "error": "Invalid sequence type or non-letter characters or other than A, U, G, C",
                "sequence": sequence,
                "label_count": label_counts.get(target_id, 0),
                "expected_length": None
            })
            continue

        expected_len = len(sequence)
        actual_count = label_counts.get(target_id, 0)
        if actual_count != expected_len:
            mismatches.append({
                "target_id": target_id,
                "error": "Mismatch in label count",
                "sequence": sequence,
                "label_count": actual_count,
                "expected_length": expected_len
            })

    return pd.DataFrame(mismatches)

# Run the validator and print results
errors = validate_train_data("train_sequences.csv", "train_labels.csv")

if errors.empty:
    print("✅ All sequences are valid and have correct label counts.")
else:
    print("❌ Found mismatches or data issues:\n")
    print(errors.to_string(index=False))  # Pretty print mismatches

❌ Found mismatches or data issues:

target_id                                                                   error                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [12]:
# Remove rows from the training data where target_id appears in errors
train_sequences = pd.read_csv("train_sequences.csv")
train_sequences = train_sequences[~train_sequences['target_id'].isin(errors['target_id'])]
train_sequences.to_csv("train_sequences_cleaned.csv", index=False)

train_labels = pd.read_csv("train_labels.csv")
train_labels['target_id'] = train_labels['ID'].astype(str).str.rsplit('_', n=1).str[0]
train_labels = train_labels[~train_labels['target_id'].isin(errors['target_id'])]
train_labels.drop(columns=['target_id'], inplace=True)
train_labels.to_csv("train_labels_cleaned.csv", index=False)

In [13]:
print(len(errors), "mismatches found.")

200 mismatches found.


In [14]:
import pandas as pd

def validate_train_data(seq_csv, labels_csv):
    sequences_df = pd.read_csv(seq_csv)
    labels_df = pd.read_csv(labels_csv)

    # Extract target_id from label IDs like "7KUC_0_1" → "7KUC_0"
    labels_df['target_id'] = labels_df['ID'].astype(str).str.rsplit('_', n=1).str[0]
    label_counts = labels_df['target_id'].value_counts().to_dict()

    mismatches = []

    for _, row in sequences_df.iterrows():
        target_id = row['target_id']
        sequence = row['sequence']

        # Validate sequence is a string of letters only
        if not isinstance(sequence, str) or not sequence.isalpha():
            mismatches.append({
                "target_id": target_id,
                "error": "Invalid sequence type or non-letter characters",
                "sequence": sequence,
                "label_count": label_counts.get(target_id, 0),
                "expected_length": None
            })
            continue

        expected_len = len(sequence)
        actual_count = label_counts.get(target_id, 0)
        if actual_count != expected_len:
            mismatches.append({
                "target_id": target_id,
                "error": "Mismatch in label count",
                "sequence": sequence,
                "label_count": actual_count,
                "expected_length": expected_len
            })

    return pd.DataFrame(mismatches)

# Run the validator and print results
errors = validate_train_data("train_sequences_cleaned.csv", "train_labels_cleaned.csv")

if errors.empty:
    print("✅ All sequences are valid and have correct label counts.")
else:
    print("❌ Found mismatches or data issues:\n")
    print(errors.to_string(index=False))  # Pretty print mismatches

✅ All sequences are valid and have correct label counts.


In [15]:
print(len(errors), "mismatches found.")

0 mismatches found.


In [7]:
errors.filter()

TypeError: Must pass either `items`, `like`, or `regex`

In [16]:
print(len(train_sequences))

1860
