In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re


In [2]:
train_examples = pd.read_csv("train.csv", names=['class', 'title', 'text'])

In [3]:
test_examples = pd.read_csv("test.csv", names=['class', 'title', 'text'])

In [4]:
#train_examples[train_examples['3']==2]

### CREATE DATASET FOR OPEN-NMT

In [5]:
train_examples = train_examples.sample(frac=1)

In [6]:
examples = pd.concat([train_examples, test_examples])

In [7]:
classes = list(examples['class'])
titles = list(examples['title'])
texts = list(examples['text'])

In [8]:
def clean_title(title):
    title = title.replace("\\", " ")
    
    #Remove last part of title (name of journal)
    title = title.strip()
    if title[-1] == ')':
        position_left_parenthesis = title.rfind("(")
        title = title[:position_left_parenthesis]
    title = title.lower()
    title = " ".join(word_tokenize(title))
    title = title.replace("& lt ; b & gt ; ... & lt ; /b & gt ;", "")
    title = title.strip()

    return title

In [9]:
def clean_text(text):
    text = text.replace("\\", " ")
    text = text.replace('#39;s', "'")

    #Remove first part of text (name of journal or date)
    text = text.split(" - ",1)
    if len(text)>1 and len(text[0]) < len(text[1]):
        text = text[1]
    else:
        text = text[0]
        
    text = text.split(" -- ",1)
    if len(text)>1 and len(text[0]) < len(text[1]):
        text = text[1]
    else:
        text = text[0]
    
    text = text.lower()
    text = " ".join(word_tokenize(text))
    text = text.replace("#", "")
    text = text.replace("& lt ;", " ")
    text = text.replace("& gt ;", " ")
    return text

In [10]:
titles = [clean_title(title) for title in titles]

In [11]:
texts = [clean_text(text) for text in texts]

In [12]:
titles

['oracle sex suit winner seeks $ 679,000 , apology',
 'oil worries remain , despite price slide',
 'mozilla gains five points , ie slips five',
 'touch wood',
 'uk hostage # 39 ; in hands of new group # 39 ;',
 'blackberry infringement ruling upheld',
 'greene seeks revenge for olympic defeat',
 'building green',
 'google , 5 big libraries team to offer books',
 "philly folly on packers ' minds",
 'advice to microsoft : learn to love linux',
 'los angeles dodgers team report - september 26',
 'hoops team rebounds',
 'pg amp ; e talks to residents about powerline project',
 'bryant accuses malone of hitting on wife',
 'nasa chief o # 39 ; keefe resigns',
 'report : gamecocks to name spurrier new head coach tuesday',
 'dow knocked lower by merck # 39 ; s drug recall',
 'most home pc users at risk for attack',
 'senior chinese leader to visit n.korea next week',
 'bringing order to the fungus among us',
 'cambodia # 39 ; s new king , norodom sihamoni , arrives home',
 'this travel search 

In [13]:
sentences = [titles[i]+" SEPTOKEN "+ texts[i] for i in range(len(titles))]

In [14]:
sent = sentences[0]
sent

'oracle sex suit winner seeks $ 679,000 , apology SEPTOKEN it sales account manager in england seeks compensation after winning her sex discrimination case .'

In [15]:
sent2 = " ".join(sent)
# sent2.replace("  ", "SP")
sent2.replace("  S E P T O K E N  ", "SEPTOKEN").replace("   ", " SP ")


'o r a c l e SP s e x SP s u i t SP w i n n e r SP s e e k s SP $ SP 6 7 9 , 0 0 0 SP , SP a p o l o g y SEPTOKEN i t SP s a l e s SP a c c o u n t SP m a n a g e r SP i n SP e n g l a n d SP s e e k s SP c o m p e n s a t i o n SP a f t e r SP w i n n i n g SP h e r SP s e x SP d i s c r i m i n a t i o n SP c a s e SP .'

In [16]:
for i in range(len(sentences)):
    sentences[i] = " ".join(sentences[i]).replace("  S E P T O K E N  ", "SEPTOKEN").replace("   ", " SP ")

In [17]:
sentences[:100]

['o r a c l e SP s e x SP s u i t SP w i n n e r SP s e e k s SP $ SP 6 7 9 , 0 0 0 SP , SP a p o l o g y SEPTOKEN i t SP s a l e s SP a c c o u n t SP m a n a g e r SP i n SP e n g l a n d SP s e e k s SP c o m p e n s a t i o n SP a f t e r SP w i n n i n g SP h e r SP s e x SP d i s c r i m i n a t i o n SP c a s e SP .',
 'o i l SP w o r r i e s SP r e m a i n SP , SP d e s p i t e SP p r i c e SP s l i d e SEPTOKEN w i t h SP c r u d e SP o i l SP p r i c e s SP d r i f t i n g SP d o w n SP t o SP a s SP l o w SP a s SP $ SP 4 1 SP p e r SP b a r r e l SP , SP o p e c SP i s SP e x p e c t e d SP b y SP m a n y SP o b s e r v e r s SP t o SP s l a s h SP p r o d u c t i o n SP , SP w h e n SP i t SP m e e t s SP o n SP f r i d a y SP .',
 "m o z i l l a SP g a i n s SP f i v e SP p o i n t s SP , SP i e SP s l i p s SP f i v e SEPTOKEN w e b SP a n a l y t i c s SP p r o v i d e r SP o n e s t a t . c o m SP h a s SP a n n o u n c e d SP t h a t SP m o z i l l a SP ' SP b r o w s

In [18]:
validation_sentences = sentences[:6000]
validation_labels = classes[:6000]

train_sentences = sentences[6000:120000]
train_labels = classes[6000:120000]

test_sentences = sentences[120000:]
test_labels = classes[120000:]

In [19]:
len(test_sentences)

7600

In [20]:
import os

directory = ["ag_news_char_full", "ag_news_char_debug"]


for i in range(2):
    
    if i==1:
        train_sentences = train_sentences[:5000]
        train_labels = train_labels[:5000]
        validation_sentences = train_sentences[:5000]
        validation_labels = train_labels[:5000]
    
    name_dir = directory[i]
    if not os.path.exists(name_dir):
        os.makedirs(name_dir)

    with open(os.path.join(name_dir, "src_train.txt"), 'w') as f:
        for line in train_sentences:
            f.write(line+'\n')
        
    with open(os.path.join(name_dir, "src_val.txt"), 'w') as f:
        for line in validation_sentences:
            f.write(line+'\n')
        
    with open(os.path.join(name_dir, "tgt_train.txt"), 'w') as f:
        for line in train_labels:
            f.write(str(line)+'\n')
        
    with open(os.path.join(name_dir, "tgt_val.txt"), 'w') as f:
        for line in validation_labels:
            f.write(str(line)+'\n')

In [21]:
name_dir = "ag_news_char_full"

with open(os.path.join(name_dir, "src_test.txt"), 'w') as f:
    for line in test_sentences:
        f.write(line+'\n')

        
with open(os.path.join(name_dir, "tgt_test.txt"), 'w') as f:
    for line in test_labels:
        f.write(str(line)+'\n')