# Steps taken to clean up the Lang-8 dataset

### Extract raw entries of English learners

In [None]:
import json
import pandas as pd

# Load the raw Lang-8 dump into memory; each line is parsed as one JSON record
# further down. Decode as UTF-8 explicitly instead of relying on the platform's
# default encoding (the dump is expected to be UTF-8 -- confirm if decoding
# errors appear).
with open(
    "lang-8-20111007-2.0/lang-8-20111007-2.0/lang-8-20111007-L1-v2.dat",
    "r",
    encoding="utf-8",
) as f:
    lines = f.readlines()

def clean_control_sequences(s: str) -> str:
    """Return *s* with ASCII control characters removed.

    Characters with a code point below 32 are dropped, except for newline,
    carriage return and tab, which are kept. Everything else is left as is.
    """
    preserved_controls = {'\n', '\r', '\t'}
    kept = []
    for ch in s:
        # Skip control characters that are not on the allow-list.
        if ord(ch) < 32 and ch not in preserved_controls:
            continue
        kept.append(ch)
    return ''.join(kept)
# Strip control characters from every raw line, then parse it as JSON.
data = list(map(json.loads, map(clean_control_sequences, lines)))

# Flatten the records into one row per learner sentence.
# Field positions used here: [2] language being learned, [3] native language,
# [4] list of the learner's sentences.
rows = []
for record in data:
    if record[2] == "English":
        speaker_native_language = record[3]
        rows.extend(
            {'native_language': speaker_native_language, 'text': learner_sentence}
            for learner_sentence in record[4]
        )

df = pd.DataFrame(rows)
df.to_csv("Lang-8/raw/all.csv", index=False)

### Filter out non-English entries

In [19]:
import pandas as pd
from langdetect import detect, LangDetectException
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; it is used below to show
# a progress bar while the language filter runs.
tqdm.pandas()

def is_english_text(text) -> bool:
    """Return True when langdetect classifies *text* as English.

    The value is converted with ``str()`` and the whole text is passed to the
    detector in one call (first-pass filter). If detection raises
    ``LangDetectException`` the text is treated as not English.

    NOTE(review): langdetect results may vary between runs unless
    ``DetectorFactory.seed`` is set -- confirm whether this filter needs to be
    reproducible.
    """
    try:
        detected_language = detect(str(text))
    except LangDetectException:
        return False
    return detected_language == 'en'

# Load every extracted learner sentence.
df = pd.read_csv("Lang-8/raw/all.csv")

# Boolean mask: True where the detector classifies the text as English.
mask_is_english = df['text'].progress_apply(is_english_text)

# Partition the rows with the mask and its complement.
english_df = df[mask_is_english]
non_english_df = df[~mask_is_english]

# Persist both partitions so the rejected rows can still be inspected.
for subset, out_path in (
    (english_df, "Lang-8/raw/english_only.csv"),
    (non_english_df, "Lang-8/raw/non_english.csv"),
):
    subset.to_csv(out_path, index=False)

100%|██████████| 3202359/3202359 [1:26:56<00:00, 613.89it/s] 


### Leave only entries with 10 - 512 tokens

In [8]:
from transformers import AutoTokenizer
from tqdm import tqdm

# Enable `progress_apply` on pandas objects.
tqdm.pandas()

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

df = pd.read_csv("Lang-8/stehwien_pado/all_clean_split.csv")

# Number of roberta-base tokens in each entry.
token_lengths = df['text'].progress_apply(
    lambda text: len(tokenizer.tokenize(str(text)))
)

# Keep entries with at least 10 tokens.
# The `token_lengths <= 512` upper-bound filter is disabled in this cell.
has_enough_tokens = token_lengths >= 10
df = df[has_enough_tokens]

df.to_csv("Lang-8/stehwien_pado/long.csv", index=False)

100%|██████████| 1194177/1194177 [00:47<00:00, 25011.69it/s]


### Split entries over 512 tokens and combine entries under 25 tokens

In [17]:
from transformers import AutoTokenizer
from tqdm import tqdm
import pandas as pd
import heapq

# Enable `progress_apply` on pandas objects.
tqdm.pandas()

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

df = pd.read_csv("Lang-8/stehwien_pado/all_clean.csv")

# Number of roberta-base tokens in each entry.
df['token_len'] = df['text'].progress_apply(
    lambda text: len(tokenizer.tokenize(str(text)))
)

# Repeatedly halve every entry longer than 512 tokens until none is left.
# One pass may not be enough: a half can itself still exceed 512 tokens.
too_long = df['token_len'] > 512
while too_long.any():
    halves = []

    for _, long_row in df[too_long].iterrows():
        pieces = tokenizer.tokenize(str(long_row['text']))
        cut = len(pieces) // 2

        # Each half is a copy of the source row (so every other column is
        # carried over) with its own text and token count.
        for chunk in (pieces[:cut], pieces[cut:]):
            half = long_row.copy()
            half['text'] = tokenizer.convert_tokens_to_string(chunk)
            half['token_len'] = len(chunk)
            halves.append(half)

    # Rows that were already short enough come first, followed by the new halves.
    df = pd.concat([df[~too_long], pd.DataFrame(halves)], ignore_index=True)
    too_long = df['token_len'] > 512

# Drop the helper column; only these two columns are kept.
df = df[['native_language', 'text']]

# Merge entries shorter than 25 tokens with other entries that share the same
# native language, so that (where possible) every remaining row has at least
# 25 tokens. A group can still end with a single row below the threshold when
# nothing is left to merge it with.
# NOTE(review): groupby skips rows whose native_language is missing (pandas
# default dropna=True) -- confirm that dropping them is intended.
final_rows = []
for lang, group in df.groupby("native_language"):
    # Min-heap of (token_count, tie_breaker, row_dict). The integer tie_breaker
    # is unique within the heap, so heapq never has to compare two dicts when
    # token counts are equal.
    heap = []
    for i, (_, row) in enumerate(group.iterrows()):
        tokens = tokenizer.tokenize(str(row['text']))
        heapq.heappush(heap, (len(tokens), i, row.to_dict()))

    while len(heap) > 1:
        # Take the shortest entry of the group.
        smallest_len, smallest_id, smallest_row = heapq.heappop(heap)

        # If the shortest entry is already long enough, so is everything else.
        if smallest_len >= 25:
            heapq.heappush(heap, (smallest_len, smallest_id, smallest_row))
            break

        # Merge it with the second-shortest entry.
        next_len, next_id, next_row = heapq.heappop(heap)

        merged_text = str(smallest_row['text']) + " " + str(next_row['text'])
        # Approximation: the merged text is not re-tokenized.
        merged_len = smallest_len + next_len

        # The merged row keeps the other fields of the shorter entry.
        new_row = smallest_row.copy()
        new_row['text'] = merged_text

        # Push back so it can be merged again if it is still too short. Both
        # source entries have left the heap, so reusing next_id stays unique.
        heapq.heappush(heap, (merged_len, next_id, new_row))

    # Whatever remains in the heap is final for this language.
    final_rows.extend(entry[2] for entry in heap)

# Build the result and write it once, after all languages have been processed
# (previously this ran inside the loop and rewrote the whole CSV per language).
df = pd.DataFrame(final_rows)
df.to_csv("Lang-8/stehwien_pado/all_clean_split.csv", index=False)

 10%|▉         | 118744/1194150 [00:04<00:41, 26065.34it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1194150/1194150 [00:45<00:00, 25969.95it/s]


### Inspect

In [24]:
import pandas as pd

df = pd.read_csv("Lang-8/stehwien_pado/sampled.csv")
# Alternative input: df = pd.read_parquet("old/train.parquet")

print(f"Total English entries: {len(df)}")
print("Native language distribution:")

# Languages ordered from most to fewest entries; printed as `"index": "language",`
# lines (the per-language counts themselves are not printed).
language_counts = df['native_language'].value_counts()
for i, lang in enumerate(language_counts.index):
    print(f'  "{i}": "{lang}",')

Total English entries: 107985
Native language distribution:
  "0": "Japanese",
  "1": "Traditional Chinese",
  "2": "Korean",
  "3": "Russian",
  "4": "Mandarin",
  "5": "Spanish",
  "6": "Cantonese",
  "7": "Vietnamese",
  "8": "Polish",
  "9": "Arabic",
  "10": "Thai",
  "11": "Portuguese(Brazil)",
  "12": "English",
  "13": "Indonesian",
  "14": "French",
  "15": "Italian",
  "16": "German",


### Sample up to 15,000 entries for each language (languages with fewer than 3,000 entries are dropped)

In [None]:
import pandas as pd

df = pd.read_csv("Lang-8/stehwien_pado/all_clean_split.csv")

# Languages with fewer than 3000 entries are dropped; every other language
# contributes at most 15000 randomly sampled entries (fixed seed).
sampled_dfs = [
    group.sample(n=min(len(group), 15000), random_state=42)
    for _, group in df.groupby('native_language')
    if len(group) >= 3000
]

sampled_df = pd.concat(sampled_dfs).reset_index(drop=True)
sampled_df.to_csv("Lang-8/stehwien_pado/sampled.csv", index=False)

### Split train and test

In [25]:
df = pd.read_csv("Lang-8/stehwien_pado/sampled.csv")

# Random 90% of the rows form the training set (fixed seed); the rows that
# were not drawn form the test set.
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(index=train_df.index)

for split_df, out_path in (
    (train_df, "Lang-8/stehwien_pado/train.parquet"),
    (test_df, "Lang-8/stehwien_pado/test.parquet"),
):
    split_df.to_parquet(out_path, index=False)