In [1]:
import random
import string
from typing import List, Tuple
import pandas as pd

In [2]:
def add_unicode_noise(text: str) -> str:
    """Insert invisible or combining Unicode characters into ``text``.

    A target of 2-5 distinct insertion offsets is drawn at random. The target
    is capped at ``len(text)`` so that short inputs no longer make
    ``random.sample`` raise ``ValueError`` (sample larger than population).

    Args:
        text: The string to corrupt.

    Returns:
        ``text`` with one noise character inserted at each chosen offset.
        Empty input is returned unchanged because it has no offsets to pick.
    """
    noise_chars = [
        '\u200B', '\u200C', '\u200D', '\u200E',  # Zero-width characters
        '\u0301', '\u0302', '\u0303',  # Combining diacritical marks
        '\uFEFF', '\u2060',  # Invisible formatting characters
        '\u200F', '\u061C',  # Directional formatting
    ]
    if not text:
        return text
    # Never ask for more distinct offsets than the text actually has.
    count = min(random.randint(2, 5), len(text))
    positions = random.sample(range(len(text)), k=count)
    # Insert from right to left so an insertion never shifts an offset that
    # is still waiting to be processed; every offset refers to the original.
    for pos in sorted(positions, reverse=True):
        text = text[:pos] + random.choice(noise_chars) + text[pos:]
    return text

def add_random_string(text: str) -> str:
    """Splice a run of random printable ASCII characters into ``text``.

    The run is 10-30 characters long, drawn (with replacement) from ASCII
    letters, digits and punctuation, and is placed at a random offset between
    the start and the end of ``text`` inclusive.
    """
    alphabet = string.ascii_letters + string.digits + string.punctuation
    junk_length = random.randint(10, 30)
    junk = ''.join(random.choices(alphabet, k=junk_length))
    cut = random.randint(0, len(text))
    head, tail = text[:cut], text[cut:]
    return head + junk + tail

def duplicate_words(text: str) -> str:
    """Repeat one randomly chosen word 3-6 times at a random word boundary.

    Inputs with fewer than two whitespace-separated words are returned
    untouched. Otherwise the result is the words re-joined with single
    spaces, with the repeated run spliced in.
    """
    tokens = text.split()
    if len(tokens) >= 2:
        chosen = random.choice(tokens)
        repeats = random.randint(3, 6)
        slot = random.randint(0, len(tokens))
        # Splicing the copies in individually joins to the same string as
        # inserting one pre-joined "word word word" element.
        tokens[slot:slot] = [chosen] * repeats
        return ' '.join(tokens)
    return text

def add_html_artifacts(text: str) -> str:
    """Append one stray HTML tag, comment or entity to ``text``.

    The artifact is picked at random and separated from the original text
    by a single space.
    """
    candidates = [
        '<div>',
        '</div>',
        '<p>',
        '</p>',
        '<span style="color: red;">',
        '<!-- comment -->',
        '&nbsp;',
        '&amp;',
        '&lt;',
        '&gt;',
    ]
    artifact = random.choice(candidates)
    return f"{text} {artifact}"

def add_control_characters(text: str) -> str:
    """Append one randomly chosen control sequence to the end of ``text``."""
    low_controls = ['\x00', '\x01', '\x02', '\x03', '\x04']  # NULL, SOH, STX, ETX, EOT
    separators = ['\x1A', '\x1B', '\x1C', '\x1D']  # SUB, ESC, FS, GS
    mixed_newlines = ['\r\r\n', '\r\n\r\n', '\n\r']  # Mixed line endings
    # Concatenation keeps the original ordering, so the choice index maps to
    # the same sequence as a single flat list would.
    suffix = random.choice(low_controls + separators + mixed_newlines)
    return ''.join((text, suffix))

def add_encoding_artifacts(text: str) -> str:
    """Prefix ``text`` with one randomly chosen mis-decoding (mojibake) artifact."""
    mojibake = [
        # Common UTF-8 mojibake
        'â€™', 'â€"', 'â€œ', 'â€',
        # Latin-1 to UTF-8 conversion artifacts
        'Ã¢', 'Ã©', 'Ã±', 'Ã¼',
        # BOM marker
        'ï»¿',
    ]
    prefix = random.choice(mojibake)
    return f"{prefix}{text}"

def add_whitespace_corruption(text: str) -> str:
    """Swap every plain space in ``text`` for one problematic whitespace pattern.

    A single pattern is chosen per call and used for all spaces. Text that
    contains no space character is returned unchanged.
    """
    space_run = ' ' * random.randint(5, 10)  # Multiple spaces
    substitutes = [
        '\t\t\t',  # Multiple tabs
        space_run,
        '\u2000', '\u2001', '\u2002',  # Different Unicode spaces
        '\u00A0', '\u1680',  # Non-breaking space, Ogham space
    ]
    replacement = random.choice(substitutes)
    # Splitting on the literal space and re-joining is the same as replace().
    return replacement.join(text.split(' '))

In [3]:

# Read original text
# with open(text_file, 'r', encoding='utf-8') as f:
#     lines = f.readlines()

# Pool of corruption functions. Each takes a string and returns a corrupted
# string, so any subset can be applied to a line one after another.
noise_functions = [
    add_unicode_noise,
    add_random_string,
    duplicate_words,
    add_html_artifacts,
    add_control_characters,
    add_encoding_artifacts,
    add_whitespace_corruption
]

# # Write noisy text to new file
# with open(output_file, 'w', encoding='utf-8') as f:
#     f.writelines(noisy_lines)

In [4]:
# Load the full CSV, then work on a fixed-size random subset of its rows.
SAMPLE_SIZE = 1000  # number of rows to draw for the noisy subset
SAMPLE_SEED = 42    # fixed so Restart & Run All selects the same rows

df_entire = pd.read_csv("/home/snp2453/DBMS/train_v3_drcat_01.csv")
# Cap n at the row count: sampling without replacement raises ValueError
# when n is larger than the frame.
df = df_entire.sample(n=min(SAMPLE_SIZE, len(df_entire)), random_state=SAMPLE_SEED)
df.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
34726,"Dear Principal,\n\nI think Policy 1 is the way...",1,Cell phones at school,llama_70b_v1,False
45076,\n\nTitle: Imagining Alternative Transportatio...,1,Car-free cities,Intel-neural-chat-7b-v3-1_LLMEssays_v1,True
49343,Introduction:\nThe Face on Mars is a well-kno...,1,The Face on Mars,Mistral7B_CME_v7,True
19888,"Dear State Senator,\n\nThe Electoral College h...",0,Does the electoral college work?,persuade_corpus,True
4093,Summer assignment is a term most students woul...,0,Summer projects,persuade_corpus,False


In [5]:
# Corrupt each sampled text line by line and collect the results in
# `noisy_text_all`, in the same row order as `df`.
noise_probability = 0.4  # chance that a non-blank line receives noise
NOISE_SEED = 42
# Seed here so re-running this cell reproduces the same noisy dataset.
random.seed(NOISE_SEED)
noisy_text_all = []

# Iterate the column directly rather than building a row Series per index.
for raw_text in df['text']:
    text = str(raw_text)
    noisy_lines = []
    for line in text.split('\n'):
        try:
            # Blank lines are never corrupted, and short-circuiting means
            # they do not consume a random draw either.
            if line.strip() and random.random() < noise_probability:
                # Apply 1-3 distinct noise functions, chained in sampled order.
                num_noise = random.randint(1, 3)
                selected_functions = random.sample(noise_functions, k=num_noise)
                noisy_line = line.strip()
                for noise_func in selected_functions:
                    noisy_line = noise_func(noisy_line)
                noisy_lines.append(noisy_line)
            else:
                noisy_lines.append(line)
        except Exception as e:
            # Best effort: if a noise function fails on this line, report it
            # and keep the original line so the row is still produced.
            print(e)
            noisy_lines.append(line)
    noisy_text = '\n'.join(noisy_lines)
    noisy_text_all.append(noisy_text)

In [6]:
# Attach the corrupted texts as a new column; noisy_text_all was built in
# df's row order, so the values line up positionally with 'text'.
df['Noisy_text'] = noisy_text_all

In [7]:
# Display the frame to compare 'text' against 'Noisy_text' side by side.
df

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,Noisy_text
34726,"Dear Principal,\n\nI think Policy 1 is the way...",1,Cell phones at school,llama_70b_v1,False,"Dear Principal,\n\nI think Policy 1 is the way..."
45076,\n\nTitle: Imagining Alternative Transportatio...,1,Car-free cities,Intel-neural-chat-7b-v3-1_LLMEssays_v1,True,\n\nTitle: Imagining Alternative Transportatio...
49343,Introduction:\nThe Face on Mars is a well-kno...,1,The Face on Mars,Mistral7B_CME_v7,True,Introduc5d@-alp3$@(tion: &lt;\nThe Face on Mar...
19888,"Dear State Senator,\n\nThe Electoral College h...",0,Does the electoral college work?,persuade_corpus,True,"Dear State Senator, &gt;\n\nThe Electoral Coll..."
4093,Summer assignment is a term most students woul...,0,Summer projects,persuade_corpus,False,Summer assignment is a term most students woul...
...,...,...,...,...,...,...
50296,Title: Riding the Waves: A Testimonial from a...,1,"""A Cowboy Who Rode the Waves""",Mistral7B_CME_v7,True,Title: Riding the Waves: A Testimonial from a...
82,Distracted Drivers\n\nMany people claim having...,0,Phones and driving,persuade_corpus,False,Ã±Dîstracted ﻿Dri​vers </div>\n\nMany people ...
13489,Unmasking the Face on Mars has always been an ...,0,The Face on Mars,persuade_corpus,True,Unmasking the Face on Mars has always been an ...
43339,Are you up tight and tense? Well this may be d...,0,Car-free cities,train_essays,True,â€™Are you up tight and tense? Well this may b...


In [8]:
# Persist the sampled rows together with their noisy variants; index=False
# keeps the DataFrame index out of the written file.
df.to_csv("/home/snp2453/DBMS/train_v3_drcat_01_noisy_1000.csv", index=False)