# Steps taken to clean up the Lang-8 dataset

### Extract raw entries of English learners

In [None]:
import json
import pandas as pd

with open("lang-8-20111007-2.0/lang-8-20111007-2.0/lang-8-20111007-L1-v2.dat", "r") as f:
    lines = f.readlines()

def clean_control_sequences(s: str) -> str:
    return ''.join(
        c for c in s
        if ord(c) >= 32 or c in '\n\r\t'
    )
data = [json.loads(clean_control_sequences(line)) for line in lines]

rows = []
for entry in data:
    learning_language = entry[2]
    if learning_language != "English":
        continue
    native_language = entry[3]
    learner_sentences = entry[4]
    for sentence in learner_sentences:
        rows.append({'native_language': native_language, 'text': sentence})

df = pd.DataFrame(rows)
df.to_csv("Lang-8/raw/all.csv", index=False)

### Filter any non-english entries

In [19]:
import pandas as pd
from langdetect import detect, LangDetectException
from tqdm import tqdm

tqdm.pandas()

def is_english_text(text) -> bool:
    '''Full text for now, first filter'''
    try:
        return detect(str(text)) == 'en'
    except LangDetectException:
        return False

# read dataset
df = pd.read_csv("Lang-8/raw/all.csv")

# split df into english and non-english
mask_is_english = df['text'].progress_apply(is_english_text)

english_df = df[mask_is_english]
non_english_df = df[~mask_is_english]

english_df.to_csv("Lang-8/raw/english_only.csv", index=False)
non_english_df.to_csv("Lang-8/raw/non_english.csv", index=False)

100%|██████████| 3202359/3202359 [1:26:56<00:00, 613.89it/s] 


### Leave only entries with 10 - 512 tokens

In [10]:
from transformers import AutoTokenizer
from tqdm import tqdm
import os

tqdm.pandas()

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

df = pd.read_csv("Lang-8/stehwien_pado/all_clean.csv")

token_lengths = df['text'].progress_apply(lambda x: len(tokenizer.tokenize(str(x))))
df = df[token_lengths >= 10]
df = df[token_lengths <= 512]

df.to_csv("Lang-8/stehwien_pado/sampled.csv", index=False)

 10%|▉         | 118459/1194150 [00:04<00:42, 25425.59it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1194150/1194150 [00:45<00:00, 25960.73it/s]
  df = df[token_lengths <= 512]


### Inspect

In [None]:
import pandas as pd

df = pd.read_csv("Lang-8/stehwien_pado/sampled.csv")
print(f"Total English entries: {len(df)}")
print("Native language distribution:")
i = 0
for lang, count in df['native_language'].value_counts().items():
    # print(f"  {lang}: {count}")
    print(f'  "{i}": "{lang}",')
    i += 1

Total English entries: 68000
Native language distribution:
  Arabic: 4000
  "0": "Arabic",
  Cantonese: 4000
  "1": "Cantonese",
  English: 4000
  "2": "English",
  French: 4000
  "3": "French",
  German: 4000
  "4": "German",
  Indonesian: 4000
  "5": "Indonesian",
  Italian: 4000
  "6": "Italian",
  Japanese: 4000
  "7": "Japanese",
  Korean: 4000
  "8": "Korean",
  Mandarin: 4000
  "9": "Mandarin",
  Polish: 4000
  "10": "Polish",
  Portuguese(Brazil): 4000
  "11": "Portuguese(Brazil)",
  Russian: 4000
  "12": "Russian",
  Spanish: 4000
  "13": "Spanish",
  Thai: 4000
  "14": "Thai",
  Traditional Chinese: 4000
  "15": "Traditional Chinese",
  Vietnamese: 4000
  "16": "Vietnamese",


### Sample same amount of entries for each language

In [12]:
import pandas as pd

df = pd.read_csv("Lang-8/stehwien_pado/sampled.csv")

# sample 5000 entries for each language, drop languages with less than 7000 entries
sampled_dfs = []
for lang, group in df.groupby('native_language'):
    if len(group) >= 4000:
        sampled_dfs.append(group.sample(n=4000, random_state=42))
        
sampled_df = pd.concat(sampled_dfs).reset_index(drop=True)
sampled_df.to_csv("Lang-8/stehwien_pado/sampled.csv", index=False)

### Split train and test

In [None]:
df = pd.read_csv("Lang-8/stehwien_pado/sampled.csv")

train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

train_df.to_parquet("Lang-8/stehwien_pado/train.parquet", index=False)
test_df.to_parquet("Lang-8/stehwien_pado/test.parquet", index=False)