### ✅ Standardize Format

In [5]:
import pandas as pd

# File paths
misspelled_file = "artificial.train.src"  # File containing incorrect sentences
corrected_file = "artificial.train.tgt"   # File containing correct sentences
output_csv = "csv_files/standardized_dataset.csv"  # Output CSV file

# Read both files in full, keeping blank lines for now. The two files are
# paired by position (line i of one goes with line i of the other), so blank
# lines must NOT be dropped from each file independently: a blank line present
# in only one file would shift every following pair while still leaving the
# two line counts equal.
with open(misspelled_file, "r", encoding="utf-8") as file:
    misspelled_lines = [line.strip() for line in file]

with open(corrected_file, "r", encoding="utf-8") as file:
    corrected_lines = [line.strip() for line in file]

# Trailing blank lines carry no pairing information; ignore them so a stray
# newline at the end of one file is not reported as a mismatch.
for lines in (misspelled_lines, corrected_lines):
    while lines and not lines[-1]:
        lines.pop()

# Ensure both files have the same number of (aligned) lines
if len(misspelled_lines) != len(corrected_lines):
    raise ValueError("Mismatch between number of misspelled and corrected sentences.")

# Drop a pair when either side is empty, so the remaining pairs stay aligned
sentence_pairs = [
    (misspelled, corrected)
    for misspelled, corrected in zip(misspelled_lines, corrected_lines)
    if misspelled and corrected
]
misspelled_sentences = [misspelled for misspelled, _ in sentence_pairs]
corrected_sentences = [corrected for _, corrected in sentence_pairs]

# Create a DataFrame with one row per sentence pair
df = pd.DataFrame({"Misspelled": misspelled_sentences, "Corrected": corrected_sentences})

# Remove duplicate pairs (if any)
df = df.drop_duplicates()

# Save to CSV (with a header row, without the index column)
df.to_csv(output_csv, index=False)

print(f"Standardized dataset saved to: {output_csv}")


Standardized dataset saved to: csv_files/standardized_dataset.csv


### ✅ Check for Unintended Errors

In [10]:
import re
import pandas as pd
import Levenshtein

# File paths used by the filtering step.
# Note: output_csv is re-bound here; the standardize cell also assigns a
# variable of the same name, so after this cell it refers to the filtered file.
input_csv = "csv_files/standardized_dataset.csv"  # Input dataset (CSV read with its first row as header)
output_csv = "csv_files/filtered_dataset.csv"  # Output filtered dataset (written without a header row)
removed_file = "csv_files/removed_sentences.txt"  # Text log of each removed sentence pair and the reason

# Function to clean text
def clean_text(text):
    """Normalize a value into a lowercase, single-spaced string.

    The value is converted with ``str()``, lowercased, stripped of leading and
    trailing whitespace, and every run of whitespace characters is collapsed
    into a single space.

    Args:
        text: Any value; non-string inputs are stringified first.

    Returns:
        The normalized string.
    """
    normalized = str(text).lower().strip()
    # Collapse tabs, newlines and repeated spaces into one space each.
    return re.sub(r"\s+", " ", normalized)

# Load the dataset, using the first row as column headers (column names are
# not checked; columns are addressed by position below).
# Every field is read as a plain string with NA detection disabled: with
# pandas' default NA handling a sentence consisting only of a token such as
# "NA", "null" or "nan" is parsed as a missing value, and clean_text's str()
# call would then silently turn it into the literal text "nan".
df = pd.read_csv(input_csv, header=0, dtype=str, keep_default_na=False)

# Ensure there are at least two columns
if df.shape[1] < 2:
    raise ValueError("The input CSV must contain at least two columns (Incorrect, Corrected).")

# Automatically select the first two columns as Incorrect and Corrected
df.iloc[:, 0] = df.iloc[:, 0].apply(clean_text)  # First column (Incorrect)
df.iloc[:, 1] = df.iloc[:, 1].apply(clean_text)  # Second column (Corrected)

# (Re)create the removed-sentences log with its header; opening in "w" mode
# truncates any log left over from a previous run before entries are appended.
with open(removed_file, "w", encoding="utf-8") as f:
    f.write("Removed Sentences:\n\n")

# Function to check if a word is unrealistic (Levenshtein distance > 4)
def is_unrealistic_word(misspelled_word, correct_word, threshold=4):
    """Tell whether a misspelling is too far from its correction.

    Args:
        misspelled_word: The word as it appears in the misspelled sentence.
        correct_word: The word at the same position in the corrected sentence.
        threshold: Largest distance still accepted (default 4).

    Returns:
        True when ``Levenshtein.distance`` between the two words is strictly
        greater than ``threshold``, otherwise False.
    """
    edit_distance = Levenshtein.distance(misspelled_word, correct_word)
    return edit_distance > threshold

# Function to check if a sentence contains unrealistic words
def contains_unrealistic_words(misspelled_sentence, corrected_sentence, threshold=4):
    """Decide whether a sentence pair should be removed from the dataset.

    Both sentences are split on whitespace and compared word by word. The pair
    is flagged when the word counts differ, or when any positional word pair
    is rejected by ``is_unrealistic_word``. Each flagged pair is appended once
    to the file named by the module-level ``removed_file``, together with the
    reason.

    Args:
        misspelled_sentence: Sentence from the misspelled column.
        corrected_sentence: Sentence from the corrected column.
        threshold: Passed through to ``is_unrealistic_word`` (default 4).

    Returns:
        True if the pair should be removed, False if it should be kept.
    """

    def _append_to_removed_log(entry):
        # Append mode: the log header is written elsewhere before filtering.
        with open(removed_file, "a", encoding="utf-8") as log:
            log.write(entry)

    source_tokens = misspelled_sentence.split()
    target_tokens = corrected_sentence.split()

    # Guard: without equal word counts the words cannot be paired by position.
    if len(source_tokens) != len(target_tokens):
        _append_to_removed_log(
            f"Removing (Word Count Mismatch):\nMisspelled: {misspelled_sentence}\nCorrected: {corrected_sentence}\n\n"
        )
        return True

    # any() stops at the first offending word, so the pair is logged only once.
    has_unrealistic_word = any(
        is_unrealistic_word(source_token, target_token, threshold)
        for source_token, target_token in zip(source_tokens, target_tokens)
    )
    if has_unrealistic_word:
        _append_to_removed_log(
            f"Removing (Unrealistic Word Found):\nMisspelled: {misspelled_sentence}\nCorrected: {corrected_sentence}\n\n"
        )
        return True

    return False

# Flag every row whose sentence pair should be removed. The predicate also
# appends each flagged pair to the removed-sentences log as a side effect.
original_size = len(df)
removal_mask = df.apply(
    lambda pair: contains_unrealistic_words(pair.iloc[0], pair.iloc[1], threshold=4),
    axis=1,
)
filtered_df = df[~removal_mask]
filtered_size = len(filtered_df)

# Persist the surviving rows; neither the index nor a header row is written.
filtered_df.to_csv(output_csv, index=False, header=False)

# Summary log, one entry per line (the first entry starts with a blank line).
summary_lines = (
    f"\nOriginal dataset size: {original_size}",
    f"Filtered dataset size: {filtered_size}",
    f"Number of sentences removed: {original_size - filtered_size}",
    f"Filtered dataset saved to: {output_csv}",
    f"Removed sentences stored in: {removed_file}",
)
print(*summary_lines, sep="\n")



Original dataset size: 627429
Filtered dataset size: 582326
Number of sentences removed: 45103
Filtered dataset saved to: csv_files/filtered_dataset.csv
Removed sentences stored in: csv_files/removed_sentences.txt
