In [None]:
import numpy as np
import pandas as pd
import re
import json
import os
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import csv
import os
import psutil
import timeit
from datasets import load_dataset
import string
from sklearn.model_selection import train_test_split
import random

from utils import *

# `nltk` itself is never imported above (only names from `nltk.tokenize` are), so
# import it explicitly instead of relying on `from utils import *` to leak the name.
import nltk

# fetch the 'punkt' resource from NLTK
nltk.download('punkt')

# seed both random number generators used in this notebook
random.seed(42)
np.random.seed(42)

In [None]:
!pip install datasets

In [None]:
# from google.colab import drive
# drive._mount('/content/drive')

# import os
# os.chdir('drive/MyDrive/machine_learning')

### I. Preprocess Raw Data

##### Pre-process news report data

In [None]:
# Load the raw news data: every file in `data_dir` is read as a CSV and its
# `content` column is appended to one flat list.
data_dir = 'datasets/raw-data/news_data'
data = []
# os.listdir() returns entries in arbitrary order; sort them so the concatenated
# corpus is identical on every run and machine.
for text_dir in sorted(os.listdir(data_dir)):
    csv_path = os.path.join(data_dir, text_dir)
    if not os.path.isfile(csv_path):
        # skip stray sub-directories (e.g. .ipynb_checkpoints) that read_csv cannot open
        continue
    data += pd.read_csv(csv_path, encoding="utf-8").content.tolist()

# original note: for news data, remove sentences within 5 words
# (the exact filtering is implemented by `preprocess` in utils)
data = preprocess(data, length_threshold = 5, select_long = True)

# save processed data, one item per line (no trailing newline after the last item)
with open('datasets/processed-data/data_news.txt', "w", encoding='utf-8') as f:
    f.write("\n".join("".join(map(str, x)) for x in data))

##### Pre-process dialogue data (Topical-Chat dataset)

In [None]:
# Read the Topical-Chat training conversations: a JSON object whose values each
# carry a 'content' list of turns.
conversations_path = "datasets/raw-data/Topical-Chat-master/Topical-Chat-master/conversations/train.json"
with open(conversations_path, encoding='utf-8') as f:
    data_temp = json.load(f)

# flatten the 'message' field of every turn of every conversation into one list
data = [
    turn['message']
    for conversation in data_temp.values()
    for turn in conversation['content']
]

# original note: for dialogue data, remove sentences within 3 words
data = preprocess(data, length_threshold=3, select_long=False)

# write the processed items out, newline-separated
processed_lines = ("".join(map(str, x)) for x in data)
with open('datasets/processed-data/data_dialog.txt', "w", encoding='utf-8') as f:
    f.write("\n".join(processed_lines))

##### Pre-process TED Talk data

In [None]:
# Read the TED2020 TSV. Quoting is disabled and default NA parsing is switched off,
# so every cell is taken verbatim from the file.
df = pd.read_csv(
    'datasets/raw-data/ted2020/ted2020.tsv',
    sep='\t',
    encoding='utf8',
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# keep only the English column as a plain Python list
eng_data = df['en'].tolist()

# for TED data no length_threshold is passed, so `preprocess` uses its own default
data = preprocess(eng_data, select_long=False)

# write the processed items out, newline-separated
processed_lines = ("".join(map(str, x)) for x in data)
with open('datasets/processed-data/data_ted.txt', "w", encoding='utf-8') as f:
    f.write("\n".join(processed_lines))

##### Pre-process Wikipedia data

In [None]:
# load wikipedia data
wiki = load_dataset("wikipedia", "20200501.en", split='train')

# Select 200,000 *distinct* random topics. Sampling without replacement avoids
# processing the same article more than once (np.random.randint draws with
# replacement); the size is capped so a smaller dump does not raise.
n_topics = min(int(2e5), len(wiki))
index = np.random.choice(len(wiki), size=n_topics, replace=False)

# preprocess raw text of each selected topic
data_final = []

# count from 1 so the progress line finishes at "N/N complete"
for number, i in enumerate(index, start=1):
    text = wiki[int(i)]['text']
    processed_text = preprocess_text(text)
    data_final.extend(processed_text)
    print('\r{}/{} complete'.format(number, len(index)), end='', flush=True)

print('Got {} sentences'.format(len(data_final)))

# save processed data, one item per line (no trailing newline after the last item)
with open('datasets/processed-data/data_wikipedia.txt', "w", encoding='utf-8') as f:
    f.write("\n".join("".join(map(str, x)) for x in data_final))

### II. Construct positive and negative samples

##### Load data

In [None]:
def _load_unique_sentences(path):
    """Read one sentence per line from `path` and return the distinct sentences.

    Only the line terminator is stripped. The processed files are written without a
    trailing newline, so blindly dropping the last character of every line would
    truncate the final sentence.

    The result is sorted: iteration order of a set of strings changes between
    interpreter runs (string hashing is randomised), which would make the seeded
    shuffle below irreproducible.
    """
    with open(path, encoding='utf-8') as f:
        unique_sentences = {line.rstrip('\n') for line in f}
    return sorted(unique_sentences)


# load news data
data_news = _load_unique_sentences("datasets/processed-data/data_news.txt")

# load dialog data
data_dialog = _load_unique_sentences("datasets/processed-data/data_dialog.txt")

# load ted data
data_ted = _load_unique_sentences("datasets/processed-data/data_ted.txt")

# load wikipedia data (file name matches the one written by the Wikipedia
# preprocessing cell: data_wikipedia.txt)
data_wikipedia = _load_unique_sentences("datasets/processed-data/data_wikipedia.txt")

# shuffle the data (reproducible: fixed seed applied to a deterministic starting order)
random.shuffle(data_news)
random.shuffle(data_dialog)
random.shuffle(data_ted)
random.shuffle(data_wikipedia)
print('Got {} news text data, {} dialog data, {} ted data, {} wikipedia data'.format(len(data_news), len(data_dialog), len(data_ted), len(data_wikipedia)))

##### Create the standard dataset (~ 1 million sentences)

In [None]:
# retrieve the sentences from the 4 datasets respectively
data_all = data_news[0:500000] + data_ted[0:500000] + data_wikipedia[0:500000] + data_dialog[0:250000]
# Remove duplicates. sorted() gives a deterministic starting order: list(set(...))
# iterates in hash order, which differs between kernel sessions and would make the
# seeded shuffle (and therefore the 1.1M subset kept below) change on every run.
data_all = sorted(set(data_all))
# shuffle
random.shuffle(data_all)
# keep only 1.1 million sentences
data_all = data_all[0:1100000]

In [None]:
# create positive and negative samples

# create positive samples
pos_samples = []
for i, sentence in enumerate(data_all):
    pos_samples.append(clean_punctuation(sentence))
    print('\r-------- {}/{} positive samples generated ----------'.format(i+1, len(data_all)), flush=True, end='')

# De-duplicate and drop empty strings.
# - `set('')` is the empty set, so subtracting it never removed '' from the samples;
#   filter on the empty string explicitly instead.
# - dict.fromkeys() de-duplicates while keeping first-seen order, so the row order fed
#   to the later seeded train/test split does not depend on string hash order.
pos_samples = [sample for sample in dict.fromkeys(pos_samples) if sample != '']
pos_labels = len(pos_samples) * ['complete sentence']

# create negative samples (a copy is passed so `data_all` itself is left untouched)
neg_samples, neg_labels = generate_negative_samples(data_all.copy())

print('Got {} positive samples and {} negative samples'.format(len(pos_samples), len(neg_samples)))

In [None]:
# Assemble the labelled dataset: positives first, then negatives, labels aligned.
samples = pos_samples + neg_samples
labels = pos_labels + neg_labels
dataset = pd.DataFrame({'data': samples, 'label': labels})

# Drop missing rows and exact (text, label) duplicates, keeping the first occurrence.
dataset = dataset.dropna().drop_duplicates(subset=['data', 'label'], keep='first')

# Keep only sentences of at most 100 space-separated words.
n_words = dataset['data'].map(lambda sent: len(sent.split(' ')))
dataset = dataset[n_words <= 100].copy()

In [None]:
# save the dataset with multiple labels for negative samples
dataset.to_csv('datasets/labeled-data/dataset_multi_label.csv', index=False)

# Save the dataset with binary labels (1 = complete sentence, 0 = anything else).
# The binary labels go into a separate frame rather than overwriting `dataset['label']`:
# with the in-place overwrite, re-running this cell wrote binary labels into the
# multi-label CSV and all-zero labels into the binary CSV.
dataset_binary = dataset.assign(label=(dataset['label'] == 'complete sentence').astype(int))
dataset_binary.to_csv('datasets/labeled-data/dataset_binary_label.csv', index=False)

In [None]:
# Build the train / ablation / test splits for the binary-label dataset.
dataset = pd.read_csv('datasets/labeled-data/dataset_binary_label.csv').dropna()

X = dataset['data'].tolist()
y = dataset['label'].tolist()

# hold out 10% of all rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# hold out 10% of the remaining training rows as the ablation set
X_train, X_ab, y_train, y_ab = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# wrap each split in a two-column frame
dataset_train = pd.DataFrame({'data': X_train, 'label': y_train})
dataset_ablation = pd.DataFrame({'data': X_ab, 'label': y_ab})
dataset_test = pd.DataFrame({'data': X_test, 'label': y_test})

# write the three splits without the index column
for split_frame, split_path in (
    (dataset_train, 'datasets/labeled-data/dataset_binary_train.csv'),
    (dataset_ablation, 'datasets/labeled-data/dataset_binary_ablation.csv'),
    (dataset_test, 'datasets/labeled-data/dataset_binary_test.csv'),
):
    split_frame.to_csv(split_path, index=False)

In [None]:
# Build the train / ablation / test splits for the multi-label dataset (string labels).
dataset = pd.read_csv('datasets/labeled-data/dataset_multi_label.csv').dropna()

X = dataset['data'].tolist()
y = dataset['label'].tolist()

# hold out 10% of all rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# hold out 10% of the remaining training rows as the ablation set
X_train, X_ab, y_train, y_ab = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# wrap each split in a two-column frame
dataset_train = pd.DataFrame({'data': X_train, 'label': y_train})
dataset_ablation = pd.DataFrame({'data': X_ab, 'label': y_ab})
dataset_test = pd.DataFrame({'data': X_test, 'label': y_test})

# write the three splits without the index column
for split_frame, split_path in (
    (dataset_train, 'datasets/labeled-data/dataset_multi_train.csv'),
    (dataset_ablation, 'datasets/labeled-data/dataset_multi_ablation.csv'),
    (dataset_test, 'datasets/labeled-data/dataset_multi_test.csv'),
):
    split_frame.to_csv(split_path, index=False)

def convert_to_number(y):
    """Translate string class labels into their integer ids.

    Args:
        y: iterable of labels.

    Returns:
        A list of the same length in which every known label name is replaced by
        its integer id; any value that is not a known label name is passed through
        unchanged.
    """
    label_ids = {
        'complete sentence': 0,
        'random cut': 1,
        'random missing': 2,
        'random repeat': 3,
        'random replace': 4,
    }
    # .get(label, label) leaves unknown labels as they are
    return [label_ids.get(label, label) for label in y]

# Replace the string labels of each split by their integer ids.
y_train, y_ab, y_test = (convert_to_number(split_labels) for split_labels in (y_train, y_ab, y_test))

# rebuild the three frames with the numeric labels
dataset_train = pd.DataFrame({'data': X_train, 'label': y_train})
dataset_ablation = pd.DataFrame({'data': X_ab, 'label': y_ab})
dataset_test = pd.DataFrame({'data': X_test, 'label': y_test})

# write the numeric-label splits without the index column
for split_frame, split_path in (
    (dataset_train, 'datasets/labeled-data/dataset_multi_num_train.csv'),
    (dataset_ablation, 'datasets/labeled-data/dataset_multi_num_ablation.csv'),
    (dataset_test, 'datasets/labeled-data/dataset_multi_num_test.csv'),
):
    split_frame.to_csv(split_path, index=False)

##### Create the large dataset (~ 5 million sentences)

In [None]:
# retrieve the sentences from the 4 datasets respectively
data_all = data_news + data_ted + data_wikipedia[0:2000000] + data_dialog
# Remove duplicates. sorted() gives a deterministic starting order: list(set(...))
# iterates in hash order, which differs between kernel sessions and would make the
# seeded shuffle below irreproducible.
data_all = sorted(set(data_all))
# shuffle
random.shuffle(data_all)
len(data_all)

In [None]:
# create positive and negative samples

# create positive samples
pos_samples = []
for i, sentence in enumerate(data_all):
    pos_samples.append(clean_punctuation(sentence))
    print('\r-------- {}/{} positive samples generated ----------'.format(i+1, len(data_all)), flush=True, end='')

# De-duplicate and drop empty strings.
# - `set('')` is the empty set, so subtracting it never removed '' from the samples;
#   filter on the empty string explicitly instead.
# - dict.fromkeys() de-duplicates while keeping first-seen order, so the row order fed
#   to the later seeded train/test split does not depend on string hash order.
pos_samples = [sample for sample in dict.fromkeys(pos_samples) if sample != '']
pos_labels = len(pos_samples) * ['complete sentence']

# create negative samples (a copy is passed so `data_all` itself is left untouched)
neg_samples, neg_labels = generate_negative_samples(data_all.copy())

print('Got {} positive samples and {} negative samples'.format(len(pos_samples), len(neg_samples)))

In [None]:
# Assemble the labelled dataset: positives first, then negatives, labels aligned.
samples = pos_samples + neg_samples
labels = pos_labels + neg_labels
dataset = pd.DataFrame({'data': samples, 'label': labels})

# Drop missing rows and exact (text, label) duplicates, keeping the first occurrence.
dataset = dataset.dropna().drop_duplicates(subset=['data', 'label'], keep='first')

# Keep only sentences of at most 100 space-separated words.
n_words = dataset['data'].map(lambda sent: len(sent.split(' ')))
dataset = dataset[n_words <= 100].copy()

In [None]:
# save the dataset with multiple labels for negative samples
dataset.to_csv('datasets/labeled-data/dataset_multi_label_large.csv', index=False)

# Save the dataset with binary labels (1 = complete sentence, 0 = anything else).
# The binary labels go into a separate frame rather than overwriting `dataset['label']`:
# with the in-place overwrite, re-running this cell wrote binary labels into the
# multi-label CSV and all-zero labels into the binary CSV.
dataset_binary = dataset.assign(label=(dataset['label'] == 'complete sentence').astype(int))
dataset_binary.to_csv('datasets/labeled-data/dataset_binary_label_large.csv', index=False)

In [None]:
# Build the train / test splits for the large binary-label dataset.
dataset = pd.read_csv('datasets/labeled-data/dataset_binary_label_large.csv').dropna()

X = dataset['data'].tolist()
y = dataset['label'].tolist()

# hold out 5% of all rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# wrap each split in a two-column frame
dataset_train = pd.DataFrame({'data': X_train, 'label': y_train})
dataset_test = pd.DataFrame({'data': X_test, 'label': y_test})

# write both splits without the index column
for split_frame, split_path in (
    (dataset_train, 'datasets/labeled-data/dataset_binary_train_large.csv'),
    (dataset_test, 'datasets/labeled-data/dataset_binary_test_large.csv'),
):
    split_frame.to_csv(split_path, index=False)