In [62]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter

In [63]:
df = pd.read_csv("bbc_text_cls.csv")

In [64]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [65]:
df.labels.unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [66]:
labels = 'business'

In [67]:
texts = df[df['labels']==labels]["text"]

In [68]:
probs = Counter() #key: (w(t-1), w(t+1)), value: {w(t): count(w(t))}
for doc in texts:
    lines = doc.split("\n")
    for line in lines:
        tokens = word_tokenize(line)
        for i in range(len(tokens)-2):
            t_0 = tokens[i]
            t_1 = tokens[i+1]
            t_2 = tokens[i+2]
            key = (t_0,t_2)
            if key not in probs:
                probs[key]=Counter()
                if t_1 not in probs:
                    probs[key][t_1] = 1
                else:
                    probs[key][t_1] += 1

In [69]:
for key,d in probs.items():
    total = sum(d.values())
    for k,v in d.items():
        d[k] = v/total

In [70]:
probs

Counter({('Ad', 'boost'): Counter({'sales': 1.0}),
         ('sales', 'Time'): Counter({'boost': 1.0}),
         ('boost', 'Warner'): Counter({'Time': 1.0}),
         ('Time', 'profit'): Counter({'Warner': 1.0}),
         ('Quarterly', 'at'): Counter({'profits': 1.0}),
         ('profits', 'US'): Counter({'at': 1.0}),
         ('at', 'media'): Counter({'US': 1.0}),
         ('US', 'giant'): Counter({'media': 1.0}),
         ('media', 'TimeWarner'): Counter({'giant': 1.0}),
         ('giant', 'jumped'): Counter({'TimeWarner': 1.0}),
         ('TimeWarner', '76'): Counter({'jumped': 1.0}),
         ('jumped', '%'): Counter({'76': 1.0}),
         ('76', 'to'): Counter({'%': 1.0}),
         ('%', '$'): Counter({'to': 1.0}),
         ('to', '1.13bn'): Counter({'$': 1.0}),
         ('$', '('): Counter({'1.13bn': 1.0}),
         ('1.13bn', '£600m'): Counter({'(': 1.0}),
         ('(', ')'): Counter({'£600m': 1.0}),
         ('£600m', 'for'): Counter({')': 1.0}),
         (')', 'the'): Counter

In [71]:
texts.iloc[0].split("\n")

['Ad sales boost Time Warner profit',
 '',
 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.',
 '',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 '',
 "Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers a

In [72]:
def spin_document(doc):
    lines = doc.split("\n")
    output = []
    for line in lines:
        if line:
            new_line = spin_line(line)
        else:
            new_line = line
        output.append(new_line)
    return "\n".join(output)

In [73]:
detokenizer = TreebankWordDetokenizer()

In [74]:
texts.iloc[0].split("\n")[2]

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [75]:
detokenizer.detokenize(word_tokenize(texts.iloc[0].split("\n")[2]))

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [76]:
def sample_word(d):
    p0 = np.random.random()
    cumulative =0
    for k,v in d.items():
        cumulative += v
        if p0 < cumulative:
            return t
    assert(False)

In [77]:
def spin_line(line):
    tokens = word_tokenize(line)
    i = 0
    output = [tokens[0]]
    while i<len(tokens)-2:
        t_0 = tokens[i]
        t_1 = tokens[i+1]
        t_2 = tokens[i+2]
        key = (t_0,t_2)
        p_dist = probs[key]
        if len(p_dist)>1 and np.random.random() <0.3:
            middle = sample_word(p_dist)
            output.append(t_1)
            output.append("<" + middle + ">")
            output.append(t_2)

            i+=2
        else:
            output.append(t_1)
            i += 1
    if i == len(tokens)-2:
        output.append(tokens[-1])
    return detokenizer.detokenize(output)

In [78]:
np.random.seed(1234)

In [81]:
i = np.random.choice(texts.shape[0])
doc = texts.iloc[i]
new_doc = spin_document(doc)

In [82]:
new_doc

"MG Rover China tie-up 'delayed'\n\nMG Rover's proposed tie-up with China's top carmaker has been delayed due to concerns by Chinese regulators, according to the Financial Times.\n\nThe paper said Chinese officials had been irritated by Rover's disclosure of its talks with Shanghai Automotive Industry Corp in October . The proposed deal was seen as crucial to safeguarding the future of Rover's Longbridge plant in the West Midlands . However, there are growing fears that the deal could result in job losses . The Observer reported on Sunday that nearly half the workforce at Longbridge could be under threat if the deal goes ahead.\n\nShanghai Automotive's proposed £1bn investment in Rover is awaiting approval by its owner, the Shanghai city government and by the National Development and Reform Commission, which oversees foreign investment by Chinese firms . According to the FT, the regulator has been annoyed by Rover's decision to talk publicly about the deal and the intense speculation w