In [1]:
# Mount Google Drive at /content/drive; every path used below lives under
# /content/drive/MyDrive. This cell only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import re

def get_first_word(line):
    """Return the first whitespace-delimited token of ``line``.

    A leading run of non-word characters (punctuation, whitespace, symbols)
    is discarded first. If nothing remains afterwards, an empty string is
    returned.
    """
    # Drop everything up to the first word character.
    trimmed = re.sub(r'^\W+', '', line)
    # Only the first token is needed, so split at most once.
    tokens = trimmed.split(maxsplit=1)
    if not tokens:
        return ''
    return tokens[0]

def process_files(src_file, trg_file, match_src, match_trg, diff_src, diff_trg):
    """Partition a line-aligned pair of files by their first word.

    Line ``i`` of ``src_file`` is paired with line ``i`` of ``trg_file``.
    When ``get_first_word`` returns the same value for both lines, the pair is
    written to ``match_src`` / ``match_trg``; otherwise it is written to
    ``diff_src`` / ``diff_trg``. Lines are written exactly as read, so the
    output files stay aligned with each other.

    Args:
        src_file: Path of the source-side input file (UTF-8).
        trg_file: Path of the target-side input file (UTF-8).
        match_src: Output path for source lines of matching pairs.
        match_trg: Output path for target lines of matching pairs.
        diff_src: Output path for source lines of non-matching pairs.
        diff_trg: Output path for target lines of non-matching pairs.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    with open(src_file, 'r', encoding='utf-8') as src, \
         open(trg_file, 'r', encoding='utf-8') as trg:
        src_lines = src.readlines()
        trg_lines = trg.readlines()

    # zip() stops at the shorter input, so unpaired trailing lines would be
    # dropped without a trace. Surface that instead of hiding it.
    if len(src_lines) != len(trg_lines):
        print(
            "Warning: Source and target files have different lengths! "
            f"({len(src_lines)} vs {len(trg_lines)} lines; "
            "unpaired trailing lines are ignored)"
        )

    matched_src = []
    matched_trg = []
    diff_src_lines = []
    diff_trg_lines = []

    for src_line, trg_line in zip(src_lines, trg_lines):
        if get_first_word(src_line) == get_first_word(trg_line):
            matched_src.append(src_line)
            matched_trg.append(trg_line)
        else:
            diff_src_lines.append(src_line)
            diff_trg_lines.append(trg_line)

    with open(match_src, 'w', encoding='utf-8') as ms, \
         open(match_trg, 'w', encoding='utf-8') as mt:
        ms.writelines(matched_src)
        mt.writelines(matched_trg)

    with open(diff_src, 'w', encoding='utf-8') as ds, \
         open(diff_trg, 'w', encoding='utf-8') as dt:
        ds.writelines(diff_src_lines)
        dt.writelines(diff_trg_lines)

# File paths
# Inputs: the line-aligned source/target files to partition.
source_file = "/content/drive/MyDrive/github-typos.train.src"
target_file = "/content/drive/MyDrive/github-typos.train.tgt"
# Outputs: pairs whose first words agree ...
match_source_file = "/content/drive/MyDrive/match.src"
match_target_file = "/content/drive/MyDrive/match.trg"
# ... and pairs whose first words differ.
diff_source_file = "/content/drive/MyDrive/diff.src"
diff_target_file = "/content/drive/MyDrive/diff.trg"

# Run the partition and report where the results were written.
process_files(source_file, target_file, match_source_file, match_target_file, diff_source_file, diff_target_file)
print("Processing complete. Check match.src and match.trg for matched lines, and diff.src and diff.trg for mismatched lines.")

Processing complete. Check match.src and match.trg for matched lines, and diff.src and diff.trg for mismatched lines.


In [None]:
import pandas as pd

def create_mispelled_csv(src_file, trg_file, output_csv):
    """Write a two-column CSV pairing each source line with its target line.

    Both inputs are read as UTF-8 and paired line by line. Every line has its
    surrounding whitespace removed before being stored. The CSV gets the
    columns ``Mispelled_Sentence`` (from ``src_file``) and
    ``Correct_Sentence`` (from ``trg_file``) and no index column.

    A warning is printed when the inputs have different line counts; pairing
    then stops at the end of the shorter file.

    Args:
        src_file: Path of the file providing the first column.
        trg_file: Path of the file providing the second column.
        output_csv: Destination path of the CSV file.
    """
    with open(src_file, 'r', encoding='utf-8') as src_handle, \
         open(trg_file, 'r', encoding='utf-8') as trg_handle:
        src_lines = list(src_handle)
        trg_lines = list(trg_handle)

    # A length mismatch is reported but does not abort the export.
    if len(src_lines) != len(trg_lines):
        print("Warning: Source and target files have different lengths!")

    # One (source, target) tuple per aligned pair, whitespace-trimmed.
    rows = [
        (raw_src.strip(), raw_trg.strip())
        for raw_src, raw_trg in zip(src_lines, trg_lines)
    ]

    frame = pd.DataFrame(rows, columns=["Mispelled_Sentence", "Correct_Sentence"])
    frame.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"✅ CSV file '{output_csv}' has been created!")

# File paths
# NOTE(review): no cell visible in this notebook writes "match(3).src" /
# "match(3).trg" (the first-word filter writes match.src / match.trg) —
# confirm where these inputs come from before re-running.
src_file = "/content/drive/MyDrive/match(3).src"
trg_file = "/content/drive/MyDrive/match(3).trg"
output_csv = "/content/drive/MyDrive/mispelled_sentences_git.csv"

# Call the function to generate the CSV
create_mispelled_csv(src_file, trg_file, output_csv)



In [None]:
def remove_content_after_sentence(input_file, output_file, sentence):
    """Copy ``input_file`` to ``output_file``, cutting it off after ``sentence``.

    The text is kept up to and including the first occurrence of
    ``sentence``; everything after it (including any line break that follows
    the sentence) is dropped. If ``sentence`` does not occur, the content is
    written unchanged and a message says so, rather than claiming that the
    content was modified.

    Errors are reported on stdout and never raised to the caller.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the file to write.
        sentence: Text marking the end of the content to keep.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            content = file.read()

        index = content.find(sentence)
        found = index != -1

        if found:
            content = content[:index + len(sentence)]

        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(content)

        # Only claim a modification when the sentence was actually located.
        if found:
            print("Content modified and saved successfully.")
        else:
            print("Sentence not found in the file; content saved unchanged.")
    except FileNotFoundError:
        print("Input file not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
input_filename = "/content/drive/MyDrive/match.trg"  # File to truncate (read only; not modified)
output_filename = "/content/drive/MyDrive/diff_match.trg"  # Receives the text up to and including the sentence
# Marker sentence: everything after its first occurrence is dropped.
sentence_to_keep = "Illustrate the operation of HEAP-EXTRACT-MAX on the heap A = [15, 13, 9, 5, 12, 8, 7, 4, 0, 6, 2, 1]."

remove_content_after_sentence(input_filename, output_filename, sentence_to_keep)


In [None]:
def split_content_at_sentence(input_file, output_file1, output_file2, sentence):
    """Split a text file in two at the first occurrence of ``sentence``.

    ``output_file1`` receives the text up to and including the sentence;
    ``output_file2`` receives everything that follows it, starting with
    whatever comes immediately after the sentence (such as a line break).
    Neither output is written when the sentence does not occur.

    Outcomes and errors are reported on stdout and never raised.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file1: Path for the part ending with the sentence.
        output_file2: Path for the part after the sentence.
        sentence: Text marking the split point.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as reader:
            text = reader.read()

        start = text.find(sentence)
        if start == -1:
            print("Sentence not found in the file.")
            return

        # The cut falls right after the last character of the sentence.
        cut = start + len(sentence)
        pieces = (
            (output_file1, text[:cut]),
            (output_file2, text[cut:]),
        )
        for destination, piece in pieces:
            with open(destination, 'w', encoding='utf-8') as writer:
                writer.write(piece)

        print("Content successfully split into two files.")

    except FileNotFoundError:
        print("Input file not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
input_filename = "/content/drive/MyDrive/match.trg"  # Your input file
# NOTE(review): "math_before_sentence" looks like a typo for
# "match_before_sentence" — confirm before renaming, since the file name is
# what ends up on Drive.
output_filename1 = "/content/drive/MyDrive/math_before_sentence.trg"  # File to store content before and including the sentence
output_filename2 = "/content/drive/MyDrive/match_after_sentence.trg"  # File to store content after the sentence

# Marker sentence at which the target-side file is split.
sentence_to_split = "Illustrate the operation of HEAP-EXTRACT-MAX on the heap A = [15, 13, 9, 5, 12, 8, 7, 4, 0, 6, 2, 1]."

split_content_at_sentence(input_filename, output_filename1, output_filename2, sentence_to_split)


In [None]:

def split_content_at_sentence(input_file, output_file1, output_file2, sentence):
    """Cut a text file into two files at the first occurrence of ``sentence``.

    The first output holds the text through the end of the sentence; the
    second holds the remainder, beginning with whatever directly follows the
    sentence (such as a line break). If the sentence is absent, nothing is
    written.

    All outcomes, including errors, are printed rather than raised.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file1: Path for the part ending with the sentence.
        output_file2: Path for the part after the sentence.
        sentence: Text marking the split point.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as reader:
            text = reader.read()

        start = text.find(sentence)
        if start == -1:
            print("Sentence not found in the file.")
            return

        # Everything before this offset belongs to the first output.
        cut = start + len(sentence)
        head, tail = text[:cut], text[cut:]

        for destination, piece in ((output_file1, head), (output_file2, tail)):
            with open(destination, 'w', encoding='utf-8') as writer:
                writer.write(piece)

        print("Content successfully split into two files.")

    except FileNotFoundError:
        print("Input file not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
input_filename = "/content/drive/MyDrive/match.src"  # Your input file
# NOTE(review): "math_before_sentence" looks like a typo for
# "match_before_sentence" — confirm before renaming, since the file name is
# what ends up on Drive.
output_filename1 = "/content/drive/MyDrive/math_before_sentence.src"  # File to store content before and including the sentence
output_filename2 = "/content/drive/MyDrive/match_after_sentence.src"  # File to store content after the sentence

# Marker sentence at which the source-side file is split.
# NOTE(review): the split only lands on the same line as in the .trg file if
# this exact text occurs at the same position in match.src — verify.
sentence_to_split = "Illustrate the operation of HEAP-EXTRACT-MAX on the heap A = [15, 13, 9, 5, 12, 8, 7, 4, 0, 6, 2, 1]."

split_content_at_sentence(input_filename, output_filename1, output_filename2, sentence_to_split)


In [None]:
import csv

def create_misspelling_dataset(src_file, trg_file, output_csv, remaining_src_file, remaining_trg_file, limit=57100):
    """Export the first ``limit`` line pairs to a CSV and save the rest as text.

    ``src_file`` and ``trg_file`` are read as UTF-8 and paired line by line.
    The first ``limit`` pairs are written to ``output_csv`` under the header
    ``misspelled,corrected`` with surrounding whitespace stripped from each
    line. All lines after the first ``limit`` are written unchanged to
    ``remaining_src_file`` and ``remaining_trg_file``.

    The summary reports the number of rows actually written, which is smaller
    than ``limit`` when the inputs are shorter. A warning is printed when the
    two inputs have different line counts, because pairing then stops at the
    shorter one.

    Errors are reported on stdout and never raised to the caller.

    Args:
        src_file: Path of the file providing the ``misspelled`` column.
        trg_file: Path of the file providing the ``corrected`` column.
        output_csv: Destination path of the CSV file.
        remaining_src_file: Destination for source lines beyond ``limit``.
        remaining_trg_file: Destination for target lines beyond ``limit``.
        limit: Maximum number of line pairs to put into the CSV.
    """
    try:
        with open(src_file, 'r', encoding='utf-8') as src, open(trg_file, 'r', encoding='utf-8') as trg:
            src_lines = src.readlines()
            trg_lines = trg.readlines()

        # zip() below pairs only up to the shorter input; make that visible.
        if len(src_lines) != len(trg_lines):
            print("Warning: Source and target files have different lengths!")

        # Split data: first `limit` lines for the CSV, the rest for new files
        csv_src_lines = src_lines[:limit]
        csv_trg_lines = trg_lines[:limit]
        remaining_src_lines = src_lines[limit:]
        remaining_trg_lines = trg_lines[limit:]

        # Write selected lines to CSV, counting what is really written.
        rows_written = 0
        with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(["misspelled", "corrected"])  # Header row
            for src_line, trg_line in zip(csv_src_lines, csv_trg_lines):
                writer.writerow([src_line.strip(), trg_line.strip()])  # One pair per row
                rows_written += 1

        # Write remaining lines to new separate files
        with open(remaining_src_file, 'w', encoding='utf-8') as src, open(remaining_trg_file, 'w', encoding='utf-8') as trg:
            src.writelines(remaining_src_lines)
            trg.writelines(remaining_trg_lines)

        print(f"CSV file '{output_csv}' created with {rows_written} lines.")
        print(f"Remaining {len(remaining_src_lines)} lines saved in '{remaining_src_file}' and '{remaining_trg_file}'.")

    except FileNotFoundError:
        print("One or both input files not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
# Inputs: the "after sentence" halves produced by the split cells.
src_filename = "/content/drive/MyDrive/match_after_sentence.src"  # Supplies the "misspelled" column
trg_filename = "/content/drive/MyDrive/match_after_sentence.trg"  # Supplies the "corrected" column
output_csv_filename = "/content/drive/MyDrive/miss_spelling_dataset_next.csv"  # CSV with the first `limit` pairs
remaining_src_filename = "/content/drive/MyDrive/remaining_data.src"  # New file for remaining src lines
remaining_trg_filename = "/content/drive/MyDrive/remaining_data.trg"  # New file for remaining trg lines

# Uses the default limit of 57100 pairs for the CSV.
create_misspelling_dataset(src_filename, trg_filename, output_csv_filename, remaining_src_filename, remaining_trg_filename)


In [None]:
import csv

def merge_csv_files(file1, file2, output_file):
    """Concatenate two CSV files into ``output_file`` under the first header.

    The header row of ``file1`` is written once, followed by every data row
    of ``file1`` and then every data row of ``file2``. The header row of
    ``file2`` is dropped; if it differs from the first header a warning is
    printed, but the merge still proceeds.

    Both inputs are opened and checked before the output is created, so a
    missing or empty input does not leave a partially written output behind.
    All files are opened with ``newline=''`` as the ``csv`` module requires,
    so line breaks embedded in quoted fields survive the round trip.

    Errors are reported on stdout and never raised to the caller.

    Args:
        file1: Path of the first CSV file; its header is kept.
        file2: Path of the second CSV file; its header is skipped.
        output_file: Destination path of the merged CSV file.
    """
    try:
        total_rows = 0  # Counter for data rows written

        with open(file1, 'r', newline='', encoding='utf-8') as f1, \
             open(file2, 'r', newline='', encoding='utf-8') as f2:
            reader1 = csv.reader(f1)
            reader2 = csv.reader(f2)

            header1 = next(reader1, None)
            if header1 is None:
                # Without a header there is nothing meaningful to merge under.
                raise ValueError(f"'{file1}' is empty; there is no header row to copy.")

            # An empty second file simply contributes no rows.
            header2 = next(reader2, None)
            if header2 is not None and header2 != header1:
                print(f"Warning: header of '{file2}' {header2} differs from "
                      f"header of '{file1}' {header1}; keeping the first.")

            # Only create the output once both inputs are known to be usable.
            with open(output_file, 'w', newline='', encoding='utf-8') as out_csv:
                writer = csv.writer(out_csv)
                writer.writerow(header1)

                for reader in (reader1, reader2):
                    for row in reader:
                        writer.writerow(row)
                        total_rows += 1

        print(f"CSV files merged successfully into '{output_file}'.")
        print(f"Total number of rows in merged file (excluding header): {total_rows}")

    except FileNotFoundError:
        print("One or both input files not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
# NOTE(review): elsewhere in this notebook the first file is written with the
# header "Mispelled_Sentence,Correct_Sentence" and the second with
# "misspelled,corrected"; the merged file keeps only the first file's header.
file1 = "/content/drive/MyDrive/mispelled_sentences_git.csv"
file2 = "/content/drive/MyDrive/miss_spelling_dataset_next.csv"
output_file = "/content/drive/MyDrive/merged_miss_spelling_dataset.csv"

merge_csv_files(file1, file2, output_file)


In [None]:
# Note: 847 entries remain in the files at remaining_src_filename and remaining_trg_filename.