In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re


In [2]:
# Load the training split. Column names are supplied explicitly, so pandas
# treats the first line of the file as data rather than as a header.
train_examples = pd.read_csv(
    "train.csv",
    names=['class', 'title', 'text'],
)

In [3]:
# Load the test split with the same explicit column names as the training
# split, so both frames can be concatenated column-for-column.
test_examples = pd.read_csv(
    "test.csv",
    names=['class', 'title', 'text'],
)

In [4]:
#train_examples[train_examples['3']==2]

### Create dataset for OpenNMT

In [5]:
# Shuffle the training rows so that the validation slice taken later from the
# front of the data is a random sample. A fixed seed keeps the shuffle (and
# therefore the train/validation split) identical across kernel restarts.
train_examples = train_examples.sample(frac=1, random_state=42)

In [6]:
# Stack the training rows on top of the test rows; later cells rely on this
# order when slicing by position.
examples = pd.concat(objs=[train_examples, test_examples])

In [7]:
# Pull each column out as a plain Python list, keeping the row order of
# `examples` so the three lists stay aligned index-by-index.
classes, titles, texts = (
    list(examples[column]) for column in ('class', 'title', 'text')
)

In [8]:
# Raw string: the pattern contains `\.` and `\ `, which are invalid escape
# sequences in a normal string literal (SyntaxWarning on recent Python).
# The regex itself is unchanged: a digit followed by any run of digits,
# '.', ',', ';', or "two spaces + digit".
_FIGURE_PATTERN = re.compile(r'[0-9]([0-9]|\.|,|;| \ [0-9])*')


def remove_figures(sentence):
    """Replace every numeric run in ``sentence`` with the literal ``NUMTOKEN``.

    A run starts at a digit and greedily absorbs following digits and the
    characters ``.``, ``,`` and ``;`` -- including trailing punctuation, so
    ``"in 2004, sales"`` becomes ``"in NUMTOKEN sales"``.

    Args:
        sentence: The string to process.

    Returns:
        A new string with each numeric run replaced by ``NUMTOKEN``.
    """
    return _FIGURE_PATTERN.sub('NUMTOKEN', sentence)

In [9]:
def clean_title(title):
    """Normalise a headline into a lowercased, space-separated token string.

    Steps, in order: backslashes become spaces, numeric runs are replaced via
    ``remove_figures``, a trailing ``(...)`` group is cut off, the text is
    lowercased (this also lowercases the inserted ``NUMTOKEN`` placeholder),
    tokenised with ``word_tokenize`` and re-joined with single spaces, and a
    tokenised ``&lt;b&gt;...&lt;/b&gt;`` remnant is removed.

    Args:
        title: Raw title string. May be empty.

    Returns:
        The cleaned title, stripped of leading/trailing whitespace.
    """
    title = title.replace("\\", " ")
    title = remove_figures(title)

    # Remove last part of title (name of journal).
    title = title.strip()
    # `endswith` instead of `title[-1]` so an empty title does not raise
    # IndexError.
    if title.endswith(')'):
        position_left_parenthesis = title.rfind("(")
        # Only cut when an opening parenthesis exists; with rfind() == -1 the
        # slice would silently drop just the final character.
        if position_left_parenthesis != -1:
            title = title[:position_left_parenthesis]
    title = title.lower()
    title = " ".join(word_tokenize(title))
    # Presumably the tokenised form of an escaped "<b>...</b>" marker left in
    # the raw data -- TODO confirm against the source CSV.
    title = title.replace("& lt ; b & gt ; ... & lt ; /b & gt ;", "")
    title = title.strip()

    return title

In [10]:
def _keep_longer_side(text, separator):
    """Split ``text`` once on ``separator`` and keep one side.

    Returns the part after the separator when it is strictly longer than the
    part before it; otherwise returns the part before it (which is the whole
    string when the separator is absent).
    """
    head, found, tail = text.partition(separator)
    if found and len(head) < len(tail):
        return tail
    return head


def clean_text(text):
    """Normalise an article body into a lowercased, space-separated token string.

    Backslashes become spaces, ``#39;s`` is replaced by an apostrophe, numeric
    runs are replaced via ``remove_figures``, a leading segment before the
    first `` - `` and then before the first `` -- `` is dropped when it is the
    shorter side, and the result is lowercased and tokenised. Finally ``#`` is
    removed and tokenised ``&lt;`` / ``&gt;`` remnants are replaced by spaces.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text.
    """
    # NOTE(review): '#39;s' -> "'" drops the trailing "s", and other '#39;'
    # occurrences are left for remove_figures to rewrite -- confirm intended.
    without_figures = remove_figures(
        text.replace("\\", " ").replace('#39;s', "'")
    )

    # Remove first part of text (name of journal or date).
    body = without_figures
    for separator in (" - ", " -- "):
        body = _keep_longer_side(body, separator)

    tokenized = " ".join(word_tokenize(body.lower()))
    return (
        tokenized
        .replace("#", "")
        .replace("& lt ;", " ")
        .replace("& gt ;", " ")
    )

In [11]:
# Apply the title normalisation to every raw title, preserving order.
titles = list(map(clean_title, titles))

In [12]:
# Apply the body-text normalisation to every raw text, preserving order.
texts = list(map(clean_text, texts))

In [13]:
# Build one model input per example: cleaned title, a separator token, then
# the cleaned text. `titles` and `texts` come from the same frame, so they
# are the same length and pair up index-by-index.
sentences = [
    f"{title} SEPTOKEN {text}"
    for title, text in zip(titles, texts)
]

In [14]:
# sentences[:100]

In [15]:
# Number of shuffled training rows held out for validation.
VALIDATION_SIZE = 6000
# Training rows were concatenated ahead of the test rows, so the test portion
# starts right after them. Deriving the boundary from the frame avoids
# silently mis-splitting if train.csv ever has a different row count.
TRAIN_ROWS = len(train_examples)

# Inputs and labels must stay aligned for positional slicing to be valid.
assert len(sentences) == len(classes)

validation_sentences = sentences[:VALIDATION_SIZE]
validation_labels = classes[:VALIDATION_SIZE]

train_sentences = sentences[VALIDATION_SIZE:TRAIN_ROWS]
train_labels = classes[VALIDATION_SIZE:TRAIN_ROWS]

test_sentences = sentences[TRAIN_ROWS:]
test_labels = classes[TRAIN_ROWS:]

In [16]:
# Sanity check: number of examples that ended up in the test split.
len(test_sentences)

7600

In [17]:
import os

# Output directory -> maximum number of examples per split (None = all).
# "ag_news_debug" is a small export for quick experiments.
export_sizes = {"ag_news_full": None, "ag_news_debug": 5000}
directory = list(export_sizes)


def _write_lines(path, lines):
    """Write each item of ``lines`` to ``path`` as text, one item per line."""
    # Explicit encoding so the output does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(str(line) + '\n')


for name_dir in directory:
    limit = export_sizes[name_dir]

    # Slice into local names instead of overwriting the global splits:
    # re-running this cell must not shrink what gets written to the full
    # export. `[:None]` keeps the whole list.
    # The debug validation files are taken from the validation split (not
    # from the truncated training split), so they do not duplicate the debug
    # training data.
    split_files = {
        "src_train.txt": train_sentences[:limit],
        "src_val.txt": validation_sentences[:limit],
        "tgt_train.txt": train_labels[:limit],
        "tgt_val.txt": validation_labels[:limit],
    }

    os.makedirs(name_dir, exist_ok=True)
    for file_name, lines in split_files.items():
        _write_lines(os.path.join(name_dir, file_name), lines)

In [18]:
name_dir = "ag_news_full"

# Do not rely on an earlier cell having created the directory.
os.makedirs(name_dir, exist_ok=True)

# Source sentences and their labels go to parallel files, one example per
# line. The encoding is explicit so the files do not depend on the platform
# default.
for file_name, lines in (
    ("src_test.txt", test_sentences),
    ("tgt_test.txt", test_labels),
):
    with open(os.path.join(name_dir, file_name), 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(str(line) + '\n')