In [1]:
# Colab-specific setup: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used by the later cells can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import re

def get_first_word(line):
    """Return the first whitespace-delimited token of ``line``.

    Any run of non-word characters (regex ``\\W``, which includes
    whitespace and punctuation) at the very start of the line is dropped
    before the token is taken. Punctuation inside or after the token is
    kept as-is.

    Args:
        line: The text line to inspect.

    Returns:
        The first token, or an empty string when nothing remains after the
        leading non-word characters are removed.
    """
    trimmed = re.sub(r'^\W+', '', line)
    # Only the first token is needed, so stop splitting after it.
    tokens = trimmed.split(maxsplit=1)
    if not tokens:
        return ''
    return tokens[0]

def process_files(src_file, trg_file, match_src, match_trg, diff_src, diff_trg):
    """Split a line-aligned file pair by whether each pair starts with the same word.

    Lines of ``src_file`` and ``trg_file`` are paired by position. For every
    pair, the first word of each line is obtained with ``get_first_word``.
    Pairs whose first words are equal are written to ``match_src`` /
    ``match_trg``; all other pairs are written to ``diff_src`` /
    ``diff_trg``. Lines are written unchanged, in their original order.

    If the two inputs have different numbers of lines, a warning is printed
    and only the leading lines that have a counterpart in the other file are
    processed; the surplus lines of the longer file are ignored.

    Args:
        src_file: Path of the source-side input file (UTF-8 text).
        trg_file: Path of the target-side input file (UTF-8 text).
        match_src: Output path for source lines of matching pairs.
        match_trg: Output path for target lines of matching pairs.
        diff_src: Output path for source lines of non-matching pairs.
        diff_trg: Output path for target lines of non-matching pairs.

    Returns:
        None. The four output files are created or overwritten.
    """
    with open(src_file, 'r', encoding='utf-8') as src, \
         open(trg_file, 'r', encoding='utf-8') as trg:
        src_lines = src.readlines()
        trg_lines = trg.readlines()

    # zip() below stops at the shorter input; without this check the surplus
    # lines would be dropped silently.
    if len(src_lines) != len(trg_lines):
        print(
            "Warning: Source and target files have different lengths "
            f"({len(src_lines)} vs {len(trg_lines)} lines); "
            "lines without a counterpart are ignored."
        )

    matched_src = []
    matched_trg = []
    diff_src_lines = []
    diff_trg_lines = []

    for src_line, trg_line in zip(src_lines, trg_lines):
        if get_first_word(src_line) == get_first_word(trg_line):
            matched_src.append(src_line)
            matched_trg.append(trg_line)
        else:
            diff_src_lines.append(src_line)
            diff_trg_lines.append(trg_line)

    # Outputs are opened only after all input has been read and classified.
    with open(match_src, 'w', encoding='utf-8') as ms, \
         open(match_trg, 'w', encoding='utf-8') as mt:
        ms.writelines(matched_src)
        mt.writelines(matched_trg)

    with open(diff_src, 'w', encoding='utf-8') as ds, \
         open(diff_trg, 'w', encoding='utf-8') as dt:
        ds.writelines(diff_src_lines)
        dt.writelines(diff_trg_lines)

# Input files: source and target sides, paired line by line.
source_file = "/content/drive/MyDrive/github-typos.train.src"
target_file = "/content/drive/MyDrive/github-typos.train.tgt"

# Output files for pairs whose first words are equal.
match_source_file = "/content/drive/MyDrive/match.src"
match_target_file = "/content/drive/MyDrive/match.trg"

# Output files for pairs whose first words differ.
diff_source_file = "/content/drive/MyDrive/diff.src"
diff_target_file = "/content/drive/MyDrive/diff.trg"

process_files(
    src_file=source_file,
    trg_file=target_file,
    match_src=match_source_file,
    match_trg=match_target_file,
    diff_src=diff_source_file,
    diff_trg=diff_target_file,
)
print("Processing complete. Check match.src and match.trg for matched lines, and diff.src and diff.trg for mismatched lines.")

Processing complete. Check match.src and match.trg for matched lines, and diff.src and diff.trg for mismatched lines.


In [None]:
import pandas as pd

def create_mispelled_csv(src_file, trg_file, output_csv):
    """Pair the lines of two text files and save the pairs as a two-column CSV.

    Lines are paired by position. Each line has its leading and trailing
    whitespace stripped, then the pair is stored as one row with the
    ``src_file`` line under ``Mispelled_Sentence`` and the ``trg_file`` line
    under ``Correct_Sentence``. When the files have different numbers of
    lines a warning is printed and only the lines that have a counterpart
    are written.

    Args:
        src_file: Path of the UTF-8 text file supplying the first column.
        trg_file: Path of the UTF-8 text file supplying the second column.
        output_csv: Path of the CSV file to create or overwrite.

    Returns:
        None. A confirmation message is printed once the CSV is written.
    """
    with open(src_file, 'r', encoding='utf-8') as src, \
         open(trg_file, 'r', encoding='utf-8') as trg:
        source_rows = list(src)
        target_rows = list(trg)

    # Report a length mismatch but continue: zip() pairs up to the shorter file.
    lengths_agree = len(source_rows) == len(target_rows)
    if not lengths_agree:
        print("Warning: Source and target files have different lengths!")

    # One (source, target) tuple per row, with surrounding whitespace removed.
    pairs = [
        (left.strip(), right.strip())
        for left, right in zip(source_rows, target_rows)
    ]

    column_names = ["Mispelled_Sentence", "Correct_Sentence"]
    frame = pd.DataFrame(pairs, columns=column_names)

    # index=False keeps the row index out of the file.
    frame.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"✅ CSV file '{output_csv}' has been created!")

# Inputs and output for the CSV export.
# NOTE(review): these inputs are "match(3).*", not the "match.src" /
# "match.trg" files written by the previous cell — confirm they are the
# intended files.
src_file = "/content/drive/MyDrive/match(3).src"
trg_file = "/content/drive/MyDrive/match(3).trg"
output_csv = "/content/drive/MyDrive/mispelled_sentences_git.csv"

# Build the two-column CSV from the paired files.
create_mispelled_csv(
    src_file=src_file,
    trg_file=trg_file,
    output_csv=output_csv,
)

